From 7a242296ac6cba31783af68ffb918451f8f17245 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Mon, 9 Mar 2026 13:11:11 -0700 Subject: [PATCH 01/56] Stage datakit design doc --- docs/design/2355_datakit.md | 218 ++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 docs/design/2355_datakit.md diff --git a/docs/design/2355_datakit.md b/docs/design/2355_datakit.md new file mode 100644 index 0000000000..7ef15bf46e --- /dev/null +++ b/docs/design/2355_datakit.md @@ -0,0 +1,218 @@ +Marin has most of the pieces for end-to-end data processing \- download, dedup, filtering, classification, decontamination, tokenization \- but the code is scattered across `experiments/` and `lib/marin/` with inconsistent formats, ad-hoc ID handling, and unclear provenance.

We propose consolidating this into **datakit**: a set of composable pipeline stages with standardized formats and conventions, living in `lib/marin/datakit/`. Dataset-specific wiring (e.g., "for Arxiv, apply these transforms") lives in `experiments/` or reference configurations.


Links:
 * [marin\#2355](https://github.com/marin-community/marin/issues/2355)
 * [gdoc](https://docs.google.com/document/d/1kDSzONg32zv2VnCO4FJiMP0fcjRSjgP0uTDpI4_C4O0)

# Golden Path

The canonical pipeline for getting a dataset from source to training:

`Download → Normalize → Embed → Classify/Filter → Dedup → Tokenize`

Notably, datakit in the proposed form does not include **data mixing** or **training**.

## 1\. Download

Download the raw dataset from Hugging Face (or other sources). Raw downloads are preserved as-is in their original format and directory structure.

## 2\. Normalize to Standard Format

Convert raw data into the **datakit standard format**:

* **File format**: Vortex \- columnar, supports pushdown filters and column projection, efficient lookup.
* **Mandatory columns**:
  * `id` \- unique document identifier (see [ID Column](#id-column) below)
  * `text` \- primary text content \- we enforce UTF-8
* **Arbitrary additional columns**: any fields present in the raw data are preserved
* **Directory structure**: preserve the original directory structure
* **Partition structure**: partition layout from the source does NOT need to be preserved at this point \- and in most cases it will not be
  * We may want to introduce a more efficient partitioning at this stage and preserve the new partitioning until tokenization
  * The partitions must follow the `part-x-of-y` suffix naming convention
* **Sort invariant**: each partition is sorted by `id`
* **Typed output:** in the code, the data has a typed representation via `Artifact`

This is the "intake" step \- all downstream stages operate on normalized Vortex datasets.

## 3\. Embed

Produce vector embeddings for each document. Output is an **attributes dataset** (see [Attributes Datasets](#attributes-datasets)) with embedding vectors keyed by `id`.

## 4\. Quality Classification, Topic Assignment

Each classifier produces an **attributes dataset** containing scores/labels keyed by `id`.

## 5\. Deduplication

Produces an **attributes dataset** marking duplicate spans or documents.

## 6\. Consolidation

Join attributes datasets back to the source documents and apply filters:

* Filter by classifier thresholds (e.g., quality score \> 0.8)
* Remove duplicate spans/documents

Output is a clean, filtered Vortex dataset \- still sorted by `id`, still co-partitioned.

## 7\. Tokenize

Convert clean text into tokenized Levanter cache format.

**Tokenization is the boundary where per-document structure ends.** The tokenizer concatenates documents into fixed-size token sequences for efficient training. Partition structure from earlier stages does not carry through \- the output is sharded Levanter TreeStore caches with a `.stats.json` summary.

# Core Design Decisions

## Vortex as the Standard Format

All intermediate datasets (from normalization through consolidation) use the Vortex columnar format. Benefits:

* Column projection (only read the columns you need)
* Filter pushdown
* Efficient sorted merge joins via Zephyr

NOTE: Vortex is much less mature than Parquet. This is a major concern. We will start with Vortex and, if we hit roadblocks, revert to Parquet.

## ID Column {#id-column}

* **Preserve existing IDs** when present in the raw data (e.g., WARC-Record-ID in DCLM, HF row indices). These carry provenance meaning and aid debugging.
  * But rename the column to `source_id`
* **Generate deterministic IDs** via content hash. Column named `id`. Deterministic hashing ensures reproducibility \- re-running the pipeline produces the same IDs, which preserves caching and diffing.

## Co-Partitioning Invariant

The key invariant that enables efficient joins: **Attributes datasets must have the same number of shards and the same key-range partitioning as their source dataset.**

This means:

* The normalization step determines the partition structure
* All downstream stages (embed, classify, dedup) preserve this structure \- same shard count, same ID ranges per shard
* Consolidation can use Zephyr's `sorted_merge_join` without a costly `group_by` shuffle

This is enforced by convention: each processing stage reads source partitions 1:1 and writes output partitions with matching structure.

## Attributes Datasets {#attributes-datasets}

Processing stages (embed, classify, dedup) produce **attributes datasets** \- lightweight Vortex files containing:

* `id` — matching the source document ID
* Stage-specific output columns (e.g., `quality_score`, `is_duplicate`, `topic_label`)

Attributes datasets:

* Use Vortex format
* Are co-partitioned with the source (same shard count and key ranges)
* Are sorted by `id` within each partition
* Can be joined back to source documents via `sorted_merge_join`

Multiple attribute datasets from different stages can be joined together during consolidation to apply compound filters.

## Step Orchestration via StepSpec

Datakit builds on `StepSpec` \- the pure-data step descriptor that captures identity and dependencies. Each datakit stage (normalize, classify, dedup, etc.) is a `StepSpec` with:

* **`name`**: human-readable stage name (e.g., `"fineweb/normalize"`)
* **`deps`**: upstream `StepSpec`s whose `output_path` this stage reads from
* **`hash_attrs`**: configuration values that affect output (model name, thresholds, etc.) — changes invalidate the cache
* **`fn`**: the callable that performs the work, receiving `output_path` as its argument

`StepSpec` gives us automatic cache invalidation (via `hash_id` derived from name \+ attrs \+ dep paths), dependency tracking, and deterministic output paths. The step runner handles locking, heartbeats, and status \- datakit stages just describe what to run.
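As a rough sketch of how that content-based identity can be derived (illustrative only \- the real derivation lives in `step_spec.py` and may differ in detail):

```py
import hashlib
import json


def hash_id(name: str, hash_attrs: dict, dep_output_paths: list[str]) -> str:
    """Illustrative: any change to the name, config attrs, or upstream output
    paths produces a new hash, hence a new output_path, hence a cache miss."""
    payload = json.dumps(
        {"name": name, "attrs": hash_attrs, "deps": dep_output_paths},
        sort_keys=True,
        default=str,
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:8]
```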
+ +Example wiring: + +```py +download = StepSpec( + name="fineweb/download", + fn=lambda output_path: download_hf(output_path=output_path, dataset_id="HuggingFaceFW/fineweb"), + hash_attrs={"dataset_id": "HuggingFaceFW/fineweb", "revision": "abc1234"}, +) + +normalize = StepSpec( + name="fineweb/normalize", + deps=[download], + fn=lambda output_path: normalize_to_vortex( + input_path=download.output_path, output_path=output_path, text_field="text", + ), + hash_attrs={"text_field": "text"}, +) + +quality = StepSpec( + name="fineweb/quality", + deps=[normalize], + fn=lambda output_path: classify( + input_path=normalize.output_path, output_path=output_path, model="fasttext-quality-v1", + ), + hash_attrs={"model": "fasttext-quality-v1"}, +) + +dedup = StepSpec( + name="fineweb/dedup", + deps=[normalize], + fn=lambda output_path: deduplicate( + input_path=normalize.output_path, output_path=output_path, mode="fuzzy_document", + ), + hash_attrs={"mode": "fuzzy_document"}, +) + +consolidated = StepSpec( + name="fineweb/consolidated", + deps=[normalize, quality, dedup], + fn=lambda output_path: consolidate( + source_path=normalize.output_path, + attribute_paths=[quality.output_path, dedup.output_path], + output_path=output_path, + quality_threshold=0.8, + ), + hash_attrs={"quality_threshold": 0.8}, +) + +tokenized = StepSpec( + name="fineweb/tokenized", + deps=[consolidated], + fn=lambda output_path: tokenize( + input_path=consolidated.output_path, output_path=output_path, + tokenizer="meta-llama/Llama-3.1-8B", + ), + hash_attrs={"tokenizer": "meta-llama/Llama-3.1-8B"}, +) +``` + +# API Surface + +## `lib/marin/datakit/` + +Core primitives — the reusable building blocks: + +``` +lib/marin/datakit/ + normalize # Raw format -> standard Vortex (id, text, ...) + embed # Document embedding + classify # Quality/topic classification + dedup # Deduplication (exact + fuzzy) + consolidate # Join attributes + apply filters +``` + +## `experiments/` (or reference configurations) + +Dataset-specific wiring \- which transforms to apply for a given dataset, expressed as `StepSpec` DAGs. + +# Execution Plan + +* Implement `datakit/normalize.py` \- standard schema definitions, ID generation, raw format to Vortex conversion with mandatory columns +* Integration tests for the normalize step +* Integration tests covering download, normalize, dedup and tokenize at reasonable scale +* Update Grug/ferry experiment definitions to consume datakit pipeline outputs directly + +# Non-Goals + +* **Replacing the mixing or training APIs** \- datakit standardizes everything upstream of tokenization. +* **Supporting non-text modalities** \- the initial scope is text datasets with a mandatory `text` field. Multimodal support can be added later by relaxing this constraint. + +# Open Questions + +1. **ID uniqueness enforcement**: Per-partition validation is cheap and will be the default. Should we also support global uniqueness checks? What's the failure mode — warn or error? +2. **Non-text datasets**: Code datasets, structured data \- do we need a configurable primary field, or is `text` always sufficient? +3. **Versioning**: How do we version datakit outputs so that downstream consumers (Grug) can pin to a specific processing run? `StepSpec.hash_id` provides content-based versioning, but do we need human-readable version tags as well? 
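To make open question 1 concrete: the per-partition sort invariant turns the default check into a single linear pass \- any duplicate ids would be adjacent. A minimal sketch (shown against Parquet/`pyarrow` for illustration; the same idea applies once a Vortex reader is wired in):

```py
import pyarrow.parquet as pq


def validate_partition_ids(partition_path: str) -> None:
    """Cheap per-partition check: ids must be strictly increasing (sorted and unique)."""
    ids = pq.read_table(partition_path, columns=["id"]).column("id").to_pylist()
    for prev, cur in zip(ids, ids[1:]):
        if cur <= prev:
            raise ValueError(f"{partition_path}: ids not strictly increasing near {cur!r}")
```

A global uniqueness check could additionally compare the last id of partition `k` with the first id of partition `k+1`, which stays cheap as long as partitions are key-range ordered.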
From 2d4ab402fd31634941a513571b88611aaf9d88ed Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Mon, 23 Mar 2026 16:27:03 -0700 Subject: [PATCH 02/56] Add datakit download, normalize, and tokenize modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the first three stages of the datakit pipeline per the design doc (#2355): download_step wraps download_hf, normalize converts raw files to sorted/deduped Parquet with content-hash IDs, and tokenize_step wraps the existing tokenizer for Levanter cache output. Integration test exercises the full DAG (download → normalize → tokenize) via StepRunner. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/__init__.py | 2 + lib/marin/src/marin/datakit/download.py | 62 ++++++++ lib/marin/src/marin/datakit/normalize.py | 194 +++++++++++++++++++++++ lib/marin/src/marin/datakit/tokenize.py | 71 +++++++++ tests/datakit/__init__.py | 2 + tests/datakit/test_datakit.py | 76 +++++++++ 6 files changed, 407 insertions(+) create mode 100644 lib/marin/src/marin/datakit/__init__.py create mode 100644 lib/marin/src/marin/datakit/download.py create mode 100644 lib/marin/src/marin/datakit/normalize.py create mode 100644 lib/marin/src/marin/datakit/tokenize.py create mode 100644 tests/datakit/__init__.py create mode 100644 tests/datakit/test_datakit.py diff --git a/lib/marin/src/marin/datakit/__init__.py b/lib/marin/src/marin/datakit/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/datakit/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/datakit/download.py b/lib/marin/src/marin/datakit/download.py new file mode 100644 index 0000000000..0724472143 --- /dev/null +++ b/lib/marin/src/marin/datakit/download.py @@ -0,0 +1,62 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Datakit download stage — fetch a HuggingFace dataset to persistent storage.""" + +from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.execution.step_spec import StepSpec + + +def download_step( + name: str, + *, + hf_dataset_id: str, + revision: str, + hf_urls_glob: list[str] | None = None, + zephyr_max_parallelism: int = 8, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads a HuggingFace dataset. + + The raw download is preserved as-is in its original format and directory structure. + + Args: + name: Step name (e.g. "fineweb/download"). + hf_dataset_id: HuggingFace dataset identifier (e.g. "HuggingFaceFW/fineweb"). + revision: Commit hash from the HF dataset repo. + hf_urls_glob: Glob patterns to select specific files. Empty means all files. + zephyr_max_parallelism: Maximum download parallelism. + deps: Optional upstream dependencies. + output_path_prefix: Override the default output path prefix. + override_output_path: Override the computed output path entirely. + + Returns: + A StepSpec whose output_path contains the raw downloaded files. 
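
    Example (illustrative values, mirroring the design doc)::

        step = download_step(
            "fineweb/download",
            hf_dataset_id="HuggingFaceFW/fineweb",
            revision="abc1234",
        )
        # step.output_path is where the raw files land once the step runs.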
+ """ + resolved_glob = hf_urls_glob or [] + + def _run(output_path: str) -> None: + download_hf( + DownloadConfig( + hf_dataset_id=hf_dataset_id, + revision=revision, + hf_urls_glob=resolved_glob, + gcs_output_path=output_path, + zephyr_max_parallelism=zephyr_max_parallelism, + ) + ) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={ + "hf_dataset_id": hf_dataset_id, + "revision": revision, + "hf_urls_glob": resolved_glob, + }, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/lib/marin/src/marin/datakit/normalize.py b/lib/marin/src/marin/datakit/normalize.py new file mode 100644 index 0000000000..bace847696 --- /dev/null +++ b/lib/marin/src/marin/datakit/normalize.py @@ -0,0 +1,194 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Datakit normalize stage — convert raw data into the datakit standard Parquet format. + +The normalize step is the "intake" for the datakit pipeline. It reads raw files +(JSONL, Parquet, or other formats supported by Zephyr), enforces a standard +schema (mandatory ``id`` and ``text`` columns), and writes co-partitioned, +sorted Parquet files. + +Key guarantees after normalization: +- Every record has a deterministic ``id`` (SHA-256 of the text content). +- If the source data has an existing ID field, it is preserved as ``source_id``. +- Text is present and UTF-8 encoded. +- Each output partition is sorted by ``id``. +- Output files follow the ``part-{shard:05d}-of-{total:05d}.parquet`` naming convention. +""" + +import hashlib +import logging +import os +from collections.abc import Iterator + +from marin.execution.artifact import PathsMetadata +from marin.execution.step_spec import StepSpec +from marin.utils import fsspec_glob +from zephyr import Dataset, ShardInfo, ZephyrContext +from zephyr.readers import load_file + +logger = logging.getLogger(__name__) + +DEFAULT_TEXT_FIELD = "text" + + +def content_hash_id(text: str) -> str: + """Generate a deterministic document ID from text content. + + Uses SHA-256 truncated to 16 hex characters for a compact but + collision-resistant identifier. + """ + return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16] + + +def _discover_input_files(input_path: str) -> list[str]: + """Find all supported input files under input_path, excluding dotfiles/directories.""" + extensions = ["jsonl.gz", "jsonl.zst", "jsonl.zstd", "jsonl", "parquet", "vortex"] + files: list[str] = [] + for ext in extensions: + files.extend(fsspec_glob(os.path.join(input_path, f"**/*.{ext}"))) + # Exclude hidden directories (e.g. .metrics/ written by download_hf) + files = [f for f in files if "/." not in f.split(input_path, 1)[-1]] + if not files: + raise ValueError(f"No supported input files found under {input_path}") + return sorted(files) + + +def _normalize_record(record: dict, text_field: str, source_id_field: str | None) -> dict: + """Transform a single record into datakit standard format. + + - Extracts and renames the text field to ``text``. + - Generates a deterministic ``id`` from the text content. + - Preserves the original ID (if any) as ``source_id``. + - Preserves all other fields. 
+ """ + text = record.get(text_field) + if text is None: + raise ValueError(f"Record missing required text field {text_field!r}: {list(record.keys())}") + if not isinstance(text, str): + text = str(text) + + doc_id = content_hash_id(text) + + normalized: dict = {"id": doc_id, "text": text} + + if source_id_field is not None and source_id_field in record: + normalized["source_id"] = str(record[source_id_field]) + + # Preserve additional columns + skip_fields = {text_field, source_id_field} if source_id_field else {text_field} + for key, value in record.items(): + if key not in skip_fields and key not in normalized: + normalized[key] = value + + return normalized + + +def normalize( + input_path: str, + output_path: str, + *, + text_field: str = DEFAULT_TEXT_FIELD, + source_id_field: str | None = None, + num_output_shards: int | None = None, + zephyr_max_workers: int = 64, +) -> PathsMetadata: + """Run the normalize pipeline. + + Reads raw files, transforms each record to the standard schema, + repartitions by ``id`` (hash-based), deduplicates, sorts each partition + by ``id``, and writes Parquet output files. + + Args: + input_path: Path to raw input files. + output_path: Directory to write output Parquet files. + text_field: Name of the field containing the primary text content. + source_id_field: Name of an existing ID field to preserve as ``source_id``. + num_output_shards: Number of output Parquet partitions. Defaults to + the number of input files. + zephyr_max_workers: Maximum Zephyr worker parallelism. + + Returns: + PathsMetadata listing the output files. + """ + input_files = _discover_input_files(input_path) + logger.info("Normalizing %d input files from %s", len(input_files), input_path) + + shards = num_output_shards or len(input_files) + + def _sort_shard(records: Iterator[dict], _shard_info: ShardInfo) -> Iterator[dict]: + batch = list(records) + batch.sort(key=lambda r: r["id"]) + return iter(batch) + + output_pattern = os.path.join(output_path, "part-{shard:05d}-of-{total:05d}.parquet") + pipeline = ( + Dataset.from_list(input_files) + .flat_map(load_file) + .map(lambda r: _normalize_record(r, text_field, source_id_field)) + .group_by( + key=lambda r: r["id"], + reducer=lambda _key, records: next(iter(records)), + num_output_shards=shards, + ) + .map_shard(_sort_shard) + .write_parquet(output_pattern) + ) + + ctx = ZephyrContext(name="datakit-normalize", max_workers=min(zephyr_max_workers, shards)) + output_files = list(ctx.execute(pipeline)) + logger.info("Wrote %d normalized Parquet partitions to %s", len(output_files), output_path) + return PathsMetadata(parent_path=output_path, paths=output_files) + + +def normalize_step( + name: str, + *, + input_path: str, + text_field: str = DEFAULT_TEXT_FIELD, + source_id_field: str | None = None, + num_output_shards: int | None = None, + zephyr_max_workers: int = 64, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec for the normalize stage. + + Args: + name: Step name (e.g. "fineweb/normalize"). + input_path: Path to raw input files. + text_field: Name of the field containing the primary text content. + source_id_field: Name of an existing ID field to preserve as ``source_id``. + num_output_shards: Number of output Parquet partitions. + zephyr_max_workers: Maximum Zephyr worker parallelism. + deps: Upstream dependencies (typically the download step). + output_path_prefix: Override the default output path prefix. 
+ override_output_path: Override the computed output path entirely. + + Returns: + A StepSpec whose output_path contains normalized Parquet files. + """ + + def _run(step_output_path: str) -> PathsMetadata: + return normalize( + input_path, + step_output_path, + text_field=text_field, + source_id_field=source_id_field, + num_output_shards=num_output_shards, + zephyr_max_workers=zephyr_max_workers, + ) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={ + "input_path": input_path, + "text_field": text_field, + "source_id_field": source_id_field, + }, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/lib/marin/src/marin/datakit/tokenize.py b/lib/marin/src/marin/datakit/tokenize.py new file mode 100644 index 0000000000..0e5c9b4168 --- /dev/null +++ b/lib/marin/src/marin/datakit/tokenize.py @@ -0,0 +1,71 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Datakit tokenize stage — convert normalized Parquet datasets into Levanter cache format. + +This is the final stage of the datakit pipeline. It reads normalized Parquet +files and produces tokenized training data in Levanter's TreeStore format. + +Tokenization is the boundary where per-document structure ends. The tokenizer +concatenates documents into fixed-size token sequences for efficient training. +""" + +import logging + +from marin.execution.step_spec import StepSpec +from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize + +logger = logging.getLogger(__name__) + + +def tokenize_step( + name: str, + *, + input_path: str, + tokenizer: str, + max_workers: int = 4096, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that tokenizes a normalized dataset. + + Reads normalized Parquet files and produces Levanter cache format output + suitable for training. + + Args: + name: Step name (e.g. "fineweb/tokenize"). + input_path: Path to normalized Parquet files (output of normalize step). + tokenizer: HuggingFace tokenizer name (e.g. "meta-llama/Llama-3.1-8B"). + max_workers: Maximum Zephyr worker parallelism. + deps: Upstream dependencies (typically the normalize or consolidate step). + output_path_prefix: Override the default output path prefix. + override_output_path: Override the computed output path entirely. + + Returns: + A StepSpec whose output_path contains the tokenized Levanter cache. 
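
    Example (illustrative; ``norm`` is assumed to be an upstream normalize step)::

        tok = tokenize_step(
            "fineweb/tokenize",
            input_path=norm.output_path,
            tokenizer="meta-llama/Llama-3.1-8B",
            deps=[norm],
        )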
+ """ + + def _run(output_path: str) -> None: + tokenize( + TokenizeConfig( + train_paths=[input_path], + validation_paths=[], + cache_path=output_path, + tokenizer=tokenizer, + max_workers=max_workers, + allow_test_in_train=True, + ) + ) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={ + "input_path": input_path, + "tokenizer": tokenizer, + }, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/tests/datakit/__init__.py b/tests/datakit/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/tests/datakit/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py new file mode 100644 index 0000000000..1c29e35a9c --- /dev/null +++ b/tests/datakit/test_datakit.py @@ -0,0 +1,76 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Integration test for the datakit pipeline: download → normalize → tokenize, wired as StepSpecs.""" + +from pathlib import Path + +import numpy as np +import pyarrow.parquet as pq +import pytest +from levanter.store.cache import CacheLedger, TreeCache + +from marin.datakit.download import download_step +from marin.datakit.normalize import content_hash_id, normalize_step +from marin.datakit.tokenize import tokenize_step +from marin.execution.step_runner import StepRunner + + +@pytest.mark.slow +def test_download_normalize_tokenize(tmp_path): + """Download → normalize → tokenize as a StepSpec DAG via StepRunner.""" + + dl = download_step( + "datakit/download", + hf_dataset_id="wikitext", + revision="main", + hf_urls_glob=["wikitext-2-v1/test-*.parquet"], + override_output_path=str(tmp_path / "raw"), + ) + + norm = normalize_step( + "datakit/normalize", + input_path=dl.output_path, + deps=[dl], + override_output_path=str(tmp_path / "normalized"), + ) + + tok = tokenize_step( + "datakit/tokenize", + input_path=norm.output_path, + tokenizer="gpt2", + deps=[norm], + override_output_path=str(tmp_path / "tokenized"), + ) + + StepRunner().run([dl, norm, tok]) + + # -- Verify download output -- + raw_files = [f for f in Path(dl.output_path).rglob("*") if f.is_file() and not f.name.startswith(".")] + assert len(raw_files) >= 1 + + # -- Verify normalize output -- + parquet_files = sorted(Path(norm.output_path).glob("*.parquet")) + assert len(parquet_files) >= 1 + + all_records = [] + for pf in parquet_files: + records = pq.read_table(str(pf)).to_pylist() + all_records.extend(records) + ids = [r["id"] for r in records] + assert ids == sorted(ids), f"Partition {pf.name} not sorted by id" + + assert len(all_records) > 0 + for record in all_records: + assert record["id"] == content_hash_id(record["text"]) + + # -- Verify tokenize output -- + train_dir = Path(tok.output_path) / "train" + ledger = CacheLedger.load(str(train_dir)) + assert ledger.is_finished + assert ledger.total_num_rows > 0 + + exemplar = {"input_ids": np.array([0], dtype=np.int32)} + cache = TreeCache.load(str(train_dir), exemplar=exemplar) + assert len(cache) == ledger.total_num_rows + assert len(cache[0]["input_ids"]) > 0 From 3f78ea6fc21c2e652460e79c410e23a9dcf4bb7c Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:01:29 -0700 Subject: [PATCH 03/56] StepSpec: auto-prefix relative override_output_path with marin_prefix When override_output_path is a relative path (no URL scheme, doesn't start with /), StepSpec.output_path now automatically prepends 
output_path_prefix or marin_prefix(). This matches the existing Executor behavior and enables datasets to use short relative paths like "raw/fineweb" in StepSpec definitions. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/execution/step_spec.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lib/marin/src/marin/execution/step_spec.py b/lib/marin/src/marin/execution/step_spec.py index 76ef153aaa..bed4db725d 100644 --- a/lib/marin/src/marin/execution/step_spec.py +++ b/lib/marin/src/marin/execution/step_spec.py @@ -10,10 +10,18 @@ from dataclasses import dataclass from functools import cached_property from typing import Any +from urllib.parse import urlparse from iris.marin_fs import marin_prefix +def _is_relative_path(url_or_path: str) -> bool: + """Return True if the path is relative (not a URL and doesn't start with /).""" + if urlparse(url_or_path).scheme: + return False + return not url_or_path.startswith("/") + + @dataclass(frozen=True) class _StepSpecMigrationConfig: """Temporary config used by ``StepSpec.as_executor_step()`` during the @@ -86,11 +94,17 @@ def name_with_hash(self) -> str: @cached_property def output_path(self) -> str: - """Output path of the step""" - if self.override_output_path is not None: - return self.override_output_path + """Output path of the step. + If ``override_output_path`` is set and relative (no URL scheme, doesn't + start with ``/``), it is automatically prefixed with ``output_path_prefix`` + or ``marin_prefix()``. + """ prefix = self.output_path_prefix or marin_prefix() + if self.override_output_path is not None: + if _is_relative_path(self.override_output_path): + return f"{prefix}/{self.override_output_path}" + return self.override_output_path return f"{prefix}/{self.name_with_hash}" def as_executor_step(self) -> ExecutorStep: # noqa: F821 From 4f84f71431401df1d7a2ba2cefcb55d5963bd234 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:03:05 -0700 Subject: [PATCH 04/56] Convert datakit/download to package and move HF download modules Converts the single datakit/download.py file into a datakit/download/ package. Moves the HuggingFace download modules (download_hf, stream_remove_columns, upload_gcs_to_hf) into datakit/download/ as their canonical location. Adds download_hf_step() as the StepSpec factory function. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/__init__.py | 18 + .../src/marin/datakit/download/huggingface.py | 409 ++++++++++++++++++ .../datakit/download/stream_remove_columns.py | 101 +++++ .../datakit/download/upload_gcs_to_hf.py | 364 ++++++++++++++++ 4 files changed, 892 insertions(+) create mode 100644 lib/marin/src/marin/datakit/download/__init__.py create mode 100644 lib/marin/src/marin/datakit/download/huggingface.py create mode 100644 lib/marin/src/marin/datakit/download/stream_remove_columns.py create mode 100644 lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py diff --git a/lib/marin/src/marin/datakit/download/__init__.py b/lib/marin/src/marin/datakit/download/__init__.py new file mode 100644 index 0000000000..cc14fdbdf4 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/__init__.py @@ -0,0 +1,18 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +from marin.datakit.download.huggingface import ( + DownloadConfig, + download_hf, + download_hf_step, +) + +# Backward-compat alias: download_step was the original name in the single-file module. 
+download_step = download_hf_step + +__all__ = [ + "DownloadConfig", + "download_hf", + "download_hf_step", + "download_step", +] diff --git a/lib/marin/src/marin/datakit/download/huggingface.py b/lib/marin/src/marin/datakit/download/huggingface.py new file mode 100644 index 0000000000..6a6ff13cd2 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/huggingface.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +A script to download a HuggingFace dataset and upload it to a specified fsspec path, +using HfFileSystem for direct streaming of data transfer. +""" + +import logging +import os +import random +import socket +import time +from dataclasses import dataclass, field + +import draccus +import huggingface_hub +from huggingface_hub import HfFileSystem +from iris.marin_fs import open_url, url_to_fs +from huggingface_hub.errors import HfHubHTTPError +from packaging.version import Version +from marin.execution.executor import THIS_OUTPUT_PATH +from marin.execution.step_spec import StepSpec +from marin.utilities.validation_utils import write_provenance_json +from zephyr import Dataset, ZephyrContext +from zephyr.writers import atomic_rename +from iris.logging import configure_logging + +logger = logging.getLogger(__name__) + +HF_PROTOCOL_PREFIX = "hf://" +HF_BUCKET_PATH_PREFIX = "buckets/" + + +@dataclass(frozen=True) +class DownloadConfig: + # fmt: off + + # HuggingFace Dataset Parameters + hf_dataset_id: str # HF Dataset to Download (as `$ORG/$DATASET` on HF Hub) + + revision: str # (Short) Commit Hash (from HF Dataset Repo; 7 characters) + hf_urls_glob: list[str] = field(default_factory=list) + # List of Glob Patterns to Match Files in HF Dataset, If empty we get all the files in a hf repo + + gcs_output_path: str = THIS_OUTPUT_PATH + """ + Path to store raw data in persistent storage (e.g. gs://$BUCKET/...). + This works with any fsspec-compatible path, but for backwards compatibility, we call it gcs_output_path. + """ + + append_sha_to_path: bool = False + """If true, write outputs under ``gcs_output_path/`` instead of directly under ``gcs_output_path``.""" + + # Job Control Parameters, used only for non-gated dataset transfers done via STS + wait_for_completion: bool = True # if True, will block until job completes + + # fmt: on + hf_repo_type_prefix: str = ( + "datasets" # The repo_type_prefix is datasets/ for datasets, + # spaces/ for spaces, and models do not need a prefix in the URL. + ) + + zephyr_max_parallelism: int = 8 + """Maximum parallelism of the Zephyr download job""" + + read_timeout_seconds: float = 120.0 + """Socket read timeout while streaming each HF file. 
Timeout failures trigger retries.""" + + progress_log_interval_seconds: float = 60.0 + """Log a heartbeat for each in-flight shard every N seconds while bytes are flowing.""" + + read_chunk_size_mib: int = 8 + """Chunk size for each streaming read from HF.""" + + +def _strip_hf_protocol(path: str) -> str: + return path.removeprefix(HF_PROTOCOL_PREFIX).lstrip("/") + + +def _resolve_hf_source_path(cfg: DownloadConfig) -> str: + source_path = ( + os.path.join(cfg.hf_repo_type_prefix, cfg.hf_dataset_id) if cfg.hf_repo_type_prefix else cfg.hf_dataset_id + ) + return _strip_hf_protocol(source_path) + + +def _assert_bucket_support_available(source_path: str) -> None: + if not source_path.startswith(HF_BUCKET_PATH_PREFIX): + return + + if Version(huggingface_hub.__version__) < Version("1.6.0"): + raise RuntimeError( + f"Bucket paths require huggingface_hub>=1.6.0, found {huggingface_hub.__version__}. " + "Upgrade the runtime environment to a buckets-capable huggingface_hub version." + ) + + +def _relative_path_in_source(file_path: str, source_path: str) -> str: + normalized_file = _strip_hf_protocol(file_path) + normalized_source = _strip_hf_protocol(source_path).rstrip("/") + + source_prefix = f"{normalized_source}/" + if normalized_file.startswith(source_prefix): + return normalized_file.removeprefix(source_prefix) + + source_parts = [segment for segment in normalized_source.split("/") if segment] + file_parts = [segment for segment in normalized_file.split("/") if segment] + + if len(file_parts) >= len(source_parts): + matches_source = True + for source_segment, file_segment in zip(source_parts, file_parts, strict=False): + if source_segment == file_segment: + continue + if file_segment.split("@", 1)[0] == source_segment: + continue + matches_source = False + break + + if matches_source: + return "/".join(file_parts[len(source_parts) :]) + + # Backwards-compatible fallback for historical dataset path layout. + return normalized_file.split("/", 3)[-1] + + +def ensure_fsspec_path_writable(output_path: str) -> None: + """Check if the fsspec path is writable by trying to create and delete a temporary file.""" + fs, _ = url_to_fs(output_path) + try: + fs.mkdirs(output_path, exist_ok=True) + test_path = os.path.join(output_path, "test_write_access") + with fs.open(test_path, "w") as f: + f.write("test") + fs.rm(test_path) + except Exception as e: + raise ValueError(f"No write access to fsspec path: {output_path} ({e})") from e + + +def stream_file_to_fsspec( + gcs_output_path: str, + file_path: str, + fsspec_file_path: str, + expected_size: int | None = None, + read_timeout_seconds: float = 120.0, + progress_log_interval_seconds: float = 60.0, + read_chunk_size_mib: int = 8, +): + """Stream a file from HfFileSystem to another fsspec path using atomic write. + + Uses atomic_rename to write to a temp file first, then rename on success. + This enables recovery across individual files if the job is interrupted. + + Args: + gcs_output_path: Base output path for the download. + file_path: Source file path on HuggingFace. + fsspec_file_path: Target file path on the destination filesystem. + expected_size: Expected file size in bytes for validation. If provided, + the download will fail if the downloaded size doesn't match. 
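        read_timeout_seconds: Socket read timeout while streaming from HF; a timeout triggers a retry.
        progress_log_interval_seconds: Seconds between per-file progress log lines.
        read_chunk_size_mib: Size in MiB of each streaming read.

    Returns:
        A dict with ``file_path``, ``status`` ("success"), and ``size`` (bytes written).

    Raises:
        RuntimeError: If the file cannot be downloaded within the retry budget.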
+ """ + hf_fs = HfFileSystem(token=os.environ.get("HF_TOKEN", False)) + target_fs, _ = url_to_fs(gcs_output_path) + chunk_size = max(1, int(read_chunk_size_mib)) * 1024 * 1024 + max_retries = 20 + # 15 minutes max sleep + max_sleep = 15 * 60 + # Minimum base wait time to avoid too-fast retries + min_base_wait = 5 + + # Retry when there is an error, such as hf rate limit + last_exception = None + for attempt in range(max_retries): + try: + target_fs.mkdirs(os.path.dirname(fsspec_file_path), exist_ok=True) + bytes_written = 0 + with atomic_rename(fsspec_file_path) as temp_path: + previous_socket_timeout = socket.getdefaulttimeout() + socket.setdefaulttimeout(read_timeout_seconds) + try: + with ( + hf_fs.open(file_path, "rb", block_size=chunk_size) as src_file, + open_url(temp_path, "wb") as dest_file, + ): + start_time = time.monotonic() + next_progress_log = start_time + progress_log_interval_seconds + while True: + try: + chunk = src_file.read(chunk_size) + except TimeoutError as timeout_error: + raise TimeoutError( + f"Timed out reading from {file_path} after " + f"{read_timeout_seconds:.1f}s with {bytes_written} bytes written" + ) from timeout_error + if not chunk: + break + dest_file.write(chunk) + bytes_written += len(chunk) + now = time.monotonic() + if progress_log_interval_seconds > 0 and now >= next_progress_log: + elapsed = max(now - start_time, 1e-9) + speed_mib_s = (bytes_written / (1024**2)) / elapsed + logger.info( + f"Streaming {file_path}: {bytes_written / (1024**2):.1f} MiB written " + f"in {elapsed:.1f}s ({speed_mib_s:.2f} MiB/s)" + ) + next_progress_log = now + progress_log_interval_seconds + finally: + socket.setdefaulttimeout(previous_socket_timeout) + + # Validate file size BEFORE atomic_rename commits the file + if expected_size is not None and bytes_written != expected_size: + raise ValueError( + f"Size mismatch for {file_path}: expected {expected_size} bytes, got {bytes_written} bytes" + ) + + logger.info(f"Streamed {file_path} successfully to {fsspec_file_path} ({bytes_written} bytes)") + return {"file_path": file_path, "status": "success", "size": bytes_written} + except Exception as e: + last_exception = e + # Base wait: min 5s, then exponential: 5, 10, 20, 40, 80, 160, 320, 600 (capped) + wait_base = max(min_base_wait, min_base_wait * (2**attempt)) + + error_type = type(e).__name__ + error_msg = str(e) + status_code = -1 + + if isinstance(e, HfHubHTTPError): + status_code = e.response.status_code + TOO_MANY_REQUESTS = 429 + if status_code == TOO_MANY_REQUESTS: + # NOTE: RateLimit "api\|pages\|resolvers";r=[remaining];t=[seconds remaining until reset] + try: + rate_limit_wait = int(e.response.headers["RateLimit"].split(";")[-1].split("=")[-1]) + wait_base = max(wait_base, rate_limit_wait + 10) # Add buffer to rate limit wait + except Exception: + logger.warning("Failed to parse rate limit header, using default wait period") + + logger.warning( + f"Attempt {attempt + 1}/{max_retries} failed for {file_path}: " + f"{error_type} (status={status_code}): {error_msg}" + ) + + jitter = random.uniform(0, min(wait_base * 0.25, 30)) # Up to 25% jitter, max 30s + wait_time = min(wait_base + jitter, max_sleep) + + logger.info(f"Retrying {file_path} in {wait_time:.1f}s...") + time.sleep(wait_time) + + raise RuntimeError( + f"Failed to download {file_path} after {max_retries} attempts. 
" + f"Last error: {type(last_exception).__name__}: {last_exception}" + ) + + +def download_hf(cfg: DownloadConfig) -> None: + + configure_logging(level=logging.INFO) + + # Set cfg.append_sha_to_path=True to mimic the older behavior of writing to gcs_output_path/. + # Some historical datasets were written that way, so this flag keeps backwards compatibility when needed. + + # Ensure the output path is writable + try: + output_path = os.path.join(cfg.gcs_output_path, cfg.revision) if cfg.append_sha_to_path else cfg.gcs_output_path + ensure_fsspec_path_writable(output_path) + except ValueError as e: + logger.exception(f"Output path validation failed: {e}") + raise e + + # Initialize Hugging Face filesystem + logger.info("Identifying files to download from HuggingFace...") + hf_fs = HfFileSystem(token=os.environ.get("HF_TOKEN", False)) + hf_source_path = _resolve_hf_source_path(cfg) + _assert_bucket_support_available(hf_source_path) + + if not cfg.hf_urls_glob: + # We get all the files using find + files = hf_fs.find(hf_source_path, revision=cfg.revision) + else: + # Get list of files directly from HfFileSystem matching the pattern + files = [] + for hf_url_glob in cfg.hf_urls_glob: + pattern = os.path.join(hf_source_path, hf_url_glob) + files += hf_fs.glob(pattern, revision=cfg.revision) + + if not files: + raise ValueError(f"No files found for dataset `{cfg.hf_dataset_id}. Used glob patterns: {cfg.hf_urls_glob}") + + # Get file sizes for validation + logger.info("Getting file sizes for validation...") + file_sizes: dict[str, int | None] = {} + for file in files: + try: + info = hf_fs.info(file, revision=cfg.revision) + file_sizes[file] = info.get("size") or None + except Exception as e: + logger.warning(f"Could not get size for {file}: {e}") + file_sizes[file] = None # Will skip validation for this file + + download_tasks = [] + + for file in files: + try: + relative_file_path = _relative_path_in_source(file, hf_source_path) + if relative_file_path.startswith(".."): + raise ValueError(f"Computed path escapes source root: source={hf_source_path}, file={file}") + fsspec_file_path = os.path.join(output_path, relative_file_path) + expected_size = file_sizes.get(file) + download_tasks.append( + ( + output_path, + file, + fsspec_file_path, + expected_size, + cfg.read_timeout_seconds, + cfg.progress_log_interval_seconds, + cfg.read_chunk_size_mib, + ) + ) + except Exception as e: + logging.exception(f"Error preparing task for {file}: {e}") + + total_files = len(download_tasks) + total_size_gb = sum(s for s in file_sizes.values() if s is not None) / (1024**3) + logger.info(f"Total number of files to process: {total_files} ({total_size_gb:.2f} GB)") + + pipeline = ( + Dataset.from_list(download_tasks) + .map(lambda task: stream_file_to_fsspec(*task)) + .write_jsonl( + f"{cfg.gcs_output_path}/.metrics/success-part-{{shard:05d}}-of-{{total:05d}}.jsonl", skip_existing=True + ) + ) + ctx = ZephyrContext(name="download-hf", max_workers=cfg.zephyr_max_parallelism) + ctx.execute(pipeline) + + # Write Provenance JSON + write_provenance_json( + output_path, + metadata={"dataset": cfg.hf_dataset_id, "version": cfg.revision, "links": files}, + ) + + logger.info(f"Streamed all files and wrote provenance JSON; check {output_path}.") + + +def download_hf_step( + name: str, + *, + hf_dataset_id: str, + revision: str, + hf_urls_glob: list[str] | None = None, + zephyr_max_parallelism: int = 8, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) 
-> StepSpec: + """Create a StepSpec that downloads a HuggingFace dataset. + + The raw download is preserved as-is in its original format and directory structure. + + Args: + name: Step name (e.g. "raw/fineweb"). + hf_dataset_id: HuggingFace dataset identifier (e.g. "HuggingFaceFW/fineweb"). + revision: Commit hash from the HF dataset repo. + hf_urls_glob: Glob patterns to select specific files. Empty means all files. + zephyr_max_parallelism: Maximum download parallelism. + deps: Optional upstream dependencies. + output_path_prefix: Override the default output path prefix. + override_output_path: Override the computed output path entirely. + + Returns: + A StepSpec whose output_path contains the raw downloaded files. + """ + resolved_glob = hf_urls_glob or [] + + def _run(output_path: str) -> None: + download_hf( + DownloadConfig( + hf_dataset_id=hf_dataset_id, + revision=revision, + hf_urls_glob=resolved_glob, + gcs_output_path=output_path, + zephyr_max_parallelism=zephyr_max_parallelism, + ) + ) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={ + "hf_dataset_id": hf_dataset_id, + "revision": revision, + "hf_urls_glob": resolved_glob, + }, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) + + +@draccus.wrap() +def main(cfg: DownloadConfig) -> None: + """Download HuggingFace dataset.""" + download_hf(cfg) + + +if __name__ == "__main__": + main() diff --git a/lib/marin/src/marin/datakit/download/stream_remove_columns.py b/lib/marin/src/marin/datakit/download/stream_remove_columns.py new file mode 100644 index 0000000000..b16e3a1f1b --- /dev/null +++ b/lib/marin/src/marin/datakit/download/stream_remove_columns.py @@ -0,0 +1,101 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Remove unnecessary columns while streaming data from huggingface.""" + +import logging +import os +from dataclasses import dataclass + +import pandas as pd +import pyarrow.parquet as pq +from huggingface_hub import HfFileSystem +from tqdm import tqdm +from zephyr import Dataset, ZephyrContext + +hf_fs = HfFileSystem() +logger = logging.getLogger(__name__) + + +def prune_stream_and_save(input_file: str, output_file: str, keep_columns: list[str]): + """ + Prunes and saves a parquet file by removing un-specified columns. + + Reads the input parquet file in batches, removes columns not in keep_columns, + and writes the result to output_file. Processing in batches avoids memory issues. + + Args: + input_file (str): Path to input parquet file on HuggingFace + output_file (str): Path where pruned parquet file will be saved + keep_columns (list[str]): List of column names to retain + """ + parquet_file = pq.ParquetFile(hf_fs.open(input_file)) + + full_df_list = [] + for batch in tqdm(parquet_file.iter_batches(batch_size=10000), desc=f"Processing {input_file}"): + df = batch.to_pandas() + + drop_columns = [col for col in df.columns if col not in keep_columns] + df = df.drop(columns=drop_columns) + + full_df_list.append(df) + + full_df = pd.concat(full_df_list) + logger.info(f"Saving pruned dataset of shape {full_df.shape} to {output_file}") + full_df.to_parquet(output_file, index=False) + + +def get_file_tasks(hf_path: str, output_path: str, keep_columns: list[str]): + """ + Generate file processing tasks for a HuggingFace subset. 
+ + Args: + hf_path (str): The HuggingFace dataset path to load + output_path (str): The output path to save the pruned dataset + keep_columns (list[str]): The columns to keep in the pruned dataset + + Yields: + Dict with input_file, output_file, and keep_columns for each parquet file + """ + logger.info(f"Loading dataset from {hf_path}") + parquet_list = hf_fs.glob(f"{hf_path}/*.parquet") + + for file in parquet_list: + output_file = os.path.join(output_path, os.path.basename(file)) + yield {"input_file": file, "output_file": output_file, "keep_columns": keep_columns} + + +@dataclass +class DatasetConfig: + hf_repo_id: str + hf_revision: str + hf_paths: list[str] + output_path: str + keep_columns: list[str] + + +def prune_hf_dataset(cfg: DatasetConfig): + logger.info(f"Starting dataset pruning for {cfg.hf_paths}") + + # Build list of subset paths to process + subset_tasks = [] + for path in cfg.hf_paths: + # HF Path form: hf://[][@]/ + hf_path = f"hf://datasets/{cfg.hf_repo_id}@{cfg.hf_revision}/{path}" + logger.info(f"Processing subset {hf_path}") + output_path = os.path.join(cfg.output_path, path) + subset_tasks.append({"hf_path": hf_path, "output_path": output_path}) + + # Build pipeline with nested parallelism: + # - Outer level: process subsets (MAX_CONCURRENT_WORKERS=1) + # - Inner level: process files within each subset + pipeline = ( + Dataset.from_list(subset_tasks) + .flat_map(lambda task: get_file_tasks(task["hf_path"], task["output_path"], cfg.keep_columns)) + .map(lambda task: prune_stream_and_save(task["input_file"], task["output_file"], cfg.keep_columns)) + ) + + logger.info("Executing pipeline") + ctx = ZephyrContext(name="hf-remove-columns") + ctx.execute(pipeline) + logger.info("Successfully processed all subsets") diff --git a/lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py b/lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py new file mode 100644 index 0000000000..1aa580c618 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py @@ -0,0 +1,364 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Upload GCS to Hugging Face (HF) Script + +This script transfers model checkpoints or other content from Google Cloud Storage (GCS) +to Hugging Face repositories. 
It handles: +- Finding checkpoint directories in GCS buckets +- Downloading the content locally (to a temporary directory) +- Uploading to a specified Hugging Face repository with appropriate versioning +- Supporting dry-run mode to preview what would be uploaded + +Usage as a script: + python upload_gcs_to_hf.py --repo-id="organization/model-name" [--dry-run] [--directory="gs://bucket/path"] + +Usage as an ExecutorStep: + upload_step = ExecutorStep( + name="upload_model_to_hf", + fn=upload_gcs_to_hf, + config=UploadConfig( + hf_repo_id="organization/model-name", + gcs_directories=["gs://bucket/path/to/model"], + dry_run=False + ) + ) +""" + +import argparse +import logging +import os +import re +import subprocess +import tempfile +from dataclasses import dataclass, field + +from google.cloud import storage +from google.cloud.storage import transfer_manager +from huggingface_hub import HfApi, create_repo +from iris.logging import configure_logging + +# Set up logging +logger = logging.getLogger(__name__) + + +@dataclass +class UploadConfig: + """Configuration for uploading from GCS to Hugging Face.""" + + hf_repo_id: str + gcs_directories: list[str] = field(default_factory=list) + dry_run: bool = False + wait_for_completion: bool = True # Added for compatibility with other configs + + +# Default GCS directories to check if none specified +DEFAULT_GCS_DIRS = [ + "gs://marin-eu-west4/checkpoints/llama-8b-tootsie-0.001-19ad63/hf/", + "gs://marin-us-central2/checkpoints/llama-8b-tootsie-phase2/hf/", + "gs://marin-us-central2/checkpoints/llama-8b-tootsie-phase3/hf/", + "gs://marin-us-central2/checkpoints/tootsie-8b-soft-raccoon-3/hf/", + "gs://marin-us-central2/checkpoints/llama-8b-tootsie-adept-phoenix/hf/", + "gs://marin-us-central2/checkpoints/tootsie-8b-sensible-starling/hf/", + "gs://marin-us-central1/checkpoints/tootsie-8b-deeper-starling/hf/", +] + + +def list_gcs_directories(gcs_path: str) -> list[tuple[str, int]]: + """List subdirectories by examining full blob paths.""" + if not gcs_path.startswith("gs://"): + raise ValueError(f"Invalid GCS path: {gcs_path}") + + path = gcs_path[5:] # Remove "gs://" + bucket_name = path.split("/")[0] + prefix = "/".join(path.split("/")[1:]) + + logger.info(f"Checking: {gcs_path}") + + # Get the bucket + client = storage.Client() + bucket = client.bucket(bucket_name) + + # List blobs with this prefix (without delimiter to get all) + blobs = bucket.list_blobs(prefix=prefix) + + # Extract potential directories from blob paths + directories = set() + step_pattern = re.compile(r"step-\d+") + + for blob in blobs: + # Remove the prefix to get the relative path + relative_path = blob.name[len(prefix) :] + + # Skip if there's no relative path + if not relative_path: + continue + + # Extract the first directory level + parts = relative_path.strip("/").split("/") + if parts: + first_dir = parts[0] + + # Check if it's a step directory + if step_pattern.match(first_dir): + directories.add(first_dir) + + # Process the directories we found + step_dirs_local = [] + for dir_name in directories: + if step_pattern.match(dir_name): + try: + step_number = int(dir_name.split("-")[1]) + full_path = f"{gcs_path}{dir_name}/" + step_dirs_local.append((full_path, step_number)) + logger.info(f"Found step directory: {full_path} with step {step_number}") + except (IndexError, ValueError) as e: + logger.error(f"Error parsing step number from {dir_name}: {e}") + + logger.info(f"Found {len(step_dirs_local)} step directories in {gcs_path}") + return step_dirs_local + + +def 
download_from_gcs(gcs_path: str, local_path: str) -> bool: + """Download contents from a GCS path to a local directory using the GCS transfer manager.""" + logger.info(f"Downloading {gcs_path} to {local_path}...") + + # Parse the GCS path (format: gs://bucket-name/path/to/files) + if not gcs_path.startswith("gs://"): + logger.error(f"Invalid GCS path format: {gcs_path}") + return False + + bucket_name = gcs_path[5:].split("/")[0] + prefix = "/".join(gcs_path[5:].split("/")[1:]) + + # Handle wildcard at the end (the original had f"{gcs_path}*") + if prefix.endswith("*"): + prefix = prefix[:-1] + + # Initialize the GCS client + client = storage.Client() + bucket = client.bucket(bucket_name) + + # List all matching blobs + blobs = list(bucket.list_blobs(prefix=prefix)) + + if not blobs: + logger.error(f"No files found in {gcs_path}") + return False + + total_files = len(blobs) + logger.info(f"Found {total_files} files to download from {gcs_path}") + + # Get the blob names to download (excluding directory placeholders) + blob_names = [] + for blob in blobs: + if not blob.name.endswith("/"): + blob_names.append(blob.name) + + if len(blob_names) < total_files: + logger.info(f"Filtered out {total_files - len(blob_names)} directory markers") + + # Ensure local directory exists + os.makedirs(local_path, exist_ok=True) + + # Log the first few blob names to debug issues + if blob_names: + logger.info(f"Sample blob names (first 3): {', '.join(blob_names[:3])}") + + # Use transfer manager to download all blobs in parallel + logger.info(f"Starting parallel download of {len(blob_names)} files...") + + transfer_manager.download_many_to_path( + bucket=bucket, + blob_names=blob_names, + destination_directory=local_path, + max_workers=8, + create_directories=True, + worker_type="process", + raise_exception=True, + ) + + logger.info(f"Download completed successfully. 
Downloaded {len(blob_names)} files.") + return True + + +def checkpoint_exists(repo_id: str, step: int, version_name: str) -> bool: + """Check if a specific revision exists in a Hugging Face repository.""" + try: + api = HfApi() + commits = api.list_repo_commits(repo_id=repo_id) + for commit in commits: + if f"step {step}" in commit.title: + return True + return False + except Exception: + return False + + +def extract_version_from_path(gcs_path: str) -> str: + """Extract the version name from a GCS path.""" + # Extract model name from path like "gs://marin-eu-west4/checkpoints/llama-8b-tootsie-0.001-19ad63/hf/" + parts = gcs_path.strip("/").split("/") + return parts[-3] + + +def upload_to_huggingface(local_path: str, repo_id: str, step: int, version_name: str) -> bool: + """Upload a local directory to Hugging Face as a specific revision.""" + logger.info(f"Uploading checkpoint {version_name}, step {step} to Hugging Face") + + # Check if repo exists, create if not + api = HfApi() + create_repo(repo_id=repo_id, exist_ok=True) + # Upload the directory + result = api.upload_folder( + folder_path=local_path, + repo_id=repo_id, + commit_message=f"Upload checkpoint for step {step} ({version_name})", + ) + try: + api.delete_tag(repo_id=repo_id, tag=version_name) + except Exception: + logger.info("Creating tag for the first time") + api.create_tag(repo_id=repo_id, tag=version_name) + logger.info("Upload completed successfully.") + logger.info(f"Commit URL: {result.commit_url}") + return True + + +def upload_gcs_to_hf(cfg: UploadConfig) -> None: + """Main function to upload model checkpoints from GCS to Hugging Face.""" + + configure_logging(level=logging.INFO) + + # Collect all step directories + all_step_dirs = [] + + # Determine which directories to process + directories_to_process = cfg.gcs_directories if cfg.gcs_directories else DEFAULT_GCS_DIRS + + # Process each directory + for directory in directories_to_process: + try: + step_dirs = list_gcs_directories(directory) + all_step_dirs.extend(step_dirs) + except Exception as e: + logger.error(f"Error listing {directory}: {e}") + + # Sort all step directories by step number + if all_step_dirs: + all_step_dirs.sort(key=lambda x: x[1]) + + # Print sorted step directories + logger.info("\nAll step directories sorted by step number:") + logger.info("-" * 50) + for full_path, _step_number in all_step_dirs: + logger.info(f"- {full_path}") + + logger.info(f"\nTotal: {len(all_step_dirs)} step directories") + + # Upload to Hugging Face + if not cfg.dry_run: + logger.info(f"\nUploading to Hugging Face repo: {cfg.hf_repo_id}") + + for full_path, step_number in all_step_dirs: + # Extract version name from the path + version_name = extract_version_from_path(full_path) + + # Check if this checkpoint already exists + if checkpoint_exists(cfg.hf_repo_id, step_number, version_name): + logger.info( + f"Step {step_number} for {version_name} already exists in HF repo {cfg.hf_repo_id}, skipping" + ) + continue + + # Create a temporary directory for downloading + with tempfile.TemporaryDirectory() as temp_dir: + logger.info(f"\nProcessing step {step_number} from {full_path} ({version_name})") + + # Download from GCS + if download_from_gcs(full_path, temp_dir): + # Upload to HF + if upload_to_huggingface(temp_dir, cfg.hf_repo_id, step_number, version_name): + logger.info( + f"Successfully uploaded step {step_number} ({version_name}) to HF repo {cfg.hf_repo_id}" + ) + else: + logger.error(f"Failed to upload step {step_number}") + else: + logger.error(f"Failed to 
download step {step_number}") + + logger.info("\nUpload process completed.") + else: + logger.info("\nDry run - showing what would be uploaded:") + logger.info("-" * 50) + + for i, (full_path, step_number) in enumerate(all_step_dirs): + version_name = extract_version_from_path(full_path) + logger.info(f"\nCheckpoint {i + 1}/{len(all_step_dirs)}:") + logger.info(f" Source: {full_path}") + logger.info(f" Target repo: {cfg.hf_repo_id}") + logger.info(f" Revision: {version_name}") + logger.info(f" Commit message: Upload checkpoint for step {step_number} ({version_name})") + + # Try to estimate what files would be uploaded + try: + # Use gsutil to list files in the directory + cmd = ["gsutil", "ls", f"{full_path}"] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + files = result.stdout.strip().split("\n") + # Filter out empty strings and limit to 5 for display + files = [f for f in files if f] + + if files: + logger.info( + f" Example files that would be uploaded ({min(len(files), 5)} of {len(files)}):" + ) + for file in files[:5]: + logger.info(f" - {os.path.basename(file)}") + if len(files) > 5: + logger.info(f" - ... and {len(files) - 5} more") + except Exception as e: + logger.error(f" Could not list files: {e}") + + logger.info("\nDry run completed - no actual uploads performed.") + else: + logger.warning("\nNo step directories found in any of the paths.") + logger.warning("You might want to check if:") + logger.warning("1. The paths are correct") + logger.warning("2. You have permissions to access these buckets") + logger.warning("3. There are step directories in these locations") + + +def main(): + """Command line entry point for direct script usage.""" + parser = argparse.ArgumentParser(description="Upload checkpoints from GCS to Hugging Face") + parser.add_argument( + "--repo-id", required=True, help='Target Hugging Face repository ID (e.g., "username/model-name")' + ) + parser.add_argument("--dry-run", action="store_true", help="Only list checkpoints without uploading") + parser.add_argument( + "--directories", + nargs="+", + help="Process specific GCS directories instead of the built-in list. Multiple directories can be provided.", + ) + args = parser.parse_args() + + # Create config from args + config = UploadConfig( + hf_repo_id=args.repo_id, gcs_directories=args.directories if args.directories else [], dry_run=args.dry_run + ) + + # Check if application default credentials are set + if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ: + logger.warning("Warning: GOOGLE_APPLICATION_CREDENTIALS environment variable not set.") + logger.warning("Make sure you're authenticated with Google Cloud before running this script.") + logger.warning("You can authenticate using: gcloud auth application-default login") + + # Run the upload function + upload_gcs_to_hf(config) + + +if __name__ == "__main__": + main() From 0a1413a628c1a3c9394e6893fe5db705f1db0a70 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:08:58 -0700 Subject: [PATCH 05/56] Move specialized downloaders to datakit/download/ with StepSpec factories Moves nemotron_cc, uncheatable_eval, ar5iv, dclm_hq, wikipedia, and filesystem modules into datakit/download/. Each module gains a *_step() factory returning StepSpec. Renames ambiguous DownloadConfig classes to Ar5ivDownloadConfig and WikipediaDownloadConfig. The uncheatable_eval make_uncheatable_eval_step() is preserved as a compat wrapper around the new uncheatable_eval_step(). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/ar5iv.py | 160 +++++++ .../src/marin/datakit/download/dclm_hq.py | 232 ++++++++++ .../src/marin/datakit/download/filesystem.py | 101 ++++ .../src/marin/datakit/download/nemotron_cc.py | 142 ++++++ .../datakit/download/uncheatable_eval.py | 438 ++++++++++++++++++ .../src/marin/datakit/download/wikipedia.py | 150 ++++++ 6 files changed, 1223 insertions(+) create mode 100644 lib/marin/src/marin/datakit/download/ar5iv.py create mode 100644 lib/marin/src/marin/datakit/download/dclm_hq.py create mode 100644 lib/marin/src/marin/datakit/download/filesystem.py create mode 100644 lib/marin/src/marin/datakit/download/nemotron_cc.py create mode 100644 lib/marin/src/marin/datakit/download/uncheatable_eval.py create mode 100644 lib/marin/src/marin/datakit/download/wikipedia.py diff --git a/lib/marin/src/marin/datakit/download/ar5iv.py b/lib/marin/src/marin/datakit/download/ar5iv.py new file mode 100644 index 0000000000..86498e12e1 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/ar5iv.py @@ -0,0 +1,160 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Download and process Ar5iv dataset from a zip file. + +Example Usage: +uv run zephyr --backend=ray --max-parallelism=1000 --memory=10GB \ + lib/marin/src/marin/download/ar5iv/download.py \ + --input_path gs://bucket/ar5iv.zip \ + --output_path gs://bucket/output +""" + +import json +import logging +import zipfile +from collections import defaultdict +from dataclasses import dataclass + +import draccus +from iris.marin_fs import open_url +from marin.execution.step_spec import StepSpec +from zephyr import Dataset, ZephyrContext +from zephyr.writers import atomic_rename +from iris.logging import configure_logging + +logger = logging.getLogger(__name__) + + +@dataclass +class Ar5ivDownloadConfig: + input_path: str + output_path: str + max_files: int | None = None # Maximum number of shards to process + + +def process_shard(shard_task: dict) -> dict: + """ + Process a single shard by extracting its files from the zip in GCS and uploading the merged JSONL. + + Args: + shard_task: Dict with keys 'input_path', 'output_path', 'shard_id', 'file_list' + """ + input_path = shard_task["input_path"] + output_path = shard_task["output_path"] + shard_id = shard_task["shard_id"] + file_list = shard_task["file_list"] + gcs_path = f"{output_path}/{shard_id}.jsonl.gz" + + with open_url(str(input_path), "rb") as f: + with zipfile.ZipFile(f) as zf: + with atomic_rename(gcs_path) as temp_path, open_url(temp_path, "wt", compression="gzip") as out_f: + for filename in file_list: + with zf.open(filename, "r") as file_handle: + content = file_handle.read() + record = { + "filename": filename, + "format": "html", + "content": content.decode("utf-8", errors="replace"), + } + print(json.dumps(record), file=out_f) + + logger.info(f"Shard {shard_id} with {len(file_list)} files uploaded to {gcs_path}") + return {"shard_id": shard_id, "num_files": len(file_list), "output_path": gcs_path} + + +def download(cfg: Ar5ivDownloadConfig) -> None: + """ + Download and process Ar5iv dataset from a zip file in GCS. + + This function can be called by the executor framework or used standalone. 
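+
+    A minimal standalone sketch (bucket paths are hypothetical):
+
+        download(Ar5ivDownloadConfig(
+            input_path="gs://example/ar5iv.zip",
+            output_path="gs://example/ar5iv-out",
+            max_files=2,
+        ))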
+ """ + logger.info("Starting transfer of Ar5iv dataset...") + logger.info(f"Source: {cfg.input_path}") + + # Use fsspec+zipfile to list all files + with open_url(str(cfg.input_path), "rb") as f: + with zipfile.ZipFile(f) as zf: + all_files = zf.infolist() + + # Group by shard directory + # We assume structure: something like: shard_id/.../file + # shard_id is derived from the second last component if files are nested. + # Adjust as needed if directory structure differs. + shard_dict = defaultdict(list) + for info in all_files: + if info.is_dir(): + continue + # E.g. path might look like: "003/something.html" + # Extract shard_id from the directory: + # Split by "/" and take the first part if we assume structure {shard_id}/file + parts = info.filename.strip("/").split("/") + if len(parts) < 2: + # File at root level - decide how to handle this case. + # If no directory structure is given, skip or treat differently. + continue + shard_id = parts[-2] # get the second-last directory as shard_id + shard_dict[shard_id].append(info.filename) + + # Apply max_files limit if provided + shard_ids = list(shard_dict.keys()) + if cfg.max_files is not None: + shard_ids = shard_ids[: cfg.max_files] + + logger.info(f"Found {len(shard_ids)} shards to process.") + + # Build task list for each shard + shard_tasks = [] + for shard_id in shard_ids: + shard_tasks.append( + { + "input_path": cfg.input_path, + "output_path": cfg.output_path, + "shard_id": shard_id, + "file_list": shard_dict[shard_id], + } + ) + + # Execute pipeline with zephyr + pipeline = ( + Dataset.from_list(shard_tasks) + .map(process_shard) + .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) + ) + ctx = ZephyrContext(name="download-ar5iv") + ctx.execute(pipeline) + + logger.info("Transfer completed successfully!") + + +def ar5iv_step( + name: str = "raw/ar5iv", + *, + input_path: str, + max_files: int | None = None, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads and processes the Ar5iv dataset from a zip file.""" + + def _run(output_path: str) -> None: + download(Ar5ivDownloadConfig(input_path=input_path, output_path=output_path, max_files=max_files)) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={"input_path": input_path, "max_files": max_files}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) + + +@draccus.wrap() +def main(cfg: Ar5ivDownloadConfig) -> None: + """CLI entrypoint for downloading and processing Ar5iv dataset.""" + + configure_logging(level=logging.INFO) + download(cfg) diff --git a/lib/marin/src/marin/datakit/download/dclm_hq.py b/lib/marin/src/marin/datakit/download/dclm_hq.py new file mode 100644 index 0000000000..83c127c079 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/dclm_hq.py @@ -0,0 +1,232 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Download DCLM HQ HTML data by fetching HTML content from Common Crawl. + +Processes DCLM HQ JSONL files and enriches them with HTML content fetched from Common Crawl +via a custom index server. Uses zephyr for parallel processing with flattened parallelism. 
+ +Example Usage: +uv run zephyr --backend=ray --max-parallelism=800 --memory=2GB \ + lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py \ + --input_path gs://marin-us-central2/raw/dclm-baseline-1.0-parquet/global/ \ + --output_path gs://marin-data/processed/dclm-hq-html/ +""" + +import io +import json +import logging +import os +import re +from dataclasses import dataclass + +import requests +from iris.marin_fs import open_url +from marin.execution.step_spec import StepSpec +import warcio +from marin.utils import fsspec_glob +from tqdm import tqdm +from zephyr import Dataset, ZephyrContext +from zephyr.writers import ensure_parent_dir + +CC_IDX_HOST_URL = "http://34.72.201.218:8080" +logger = logging.getLogger(__name__) + + +@dataclass +class DCLMHQDownloadConfig: + input_path: str + output_path: str + + +@dataclass +class FileTask: + """Represents a single file processing task.""" + + input_file_path: str + output_file_path: str + + +def fetch_warc_from_cc(s3_warc_path: str, length: int, offset: int) -> str: + """ + Fetch a WARC record from Common Crawl S3 bucket using byte range requests we get + from the CC index via `find_html_in_cc`. + Args: + s3_warc_path: Path to WARC file in S3 bucket + length: Length of the record in bytes + offset: Byte offset of the record in the WARC file + Returns: + The WARC record content as a string + """ + # Convert string values to integers + offset = int(offset) + length = int(length) + + # Make range request to CommonCrawl + response = requests.get( + f"https://data.commoncrawl.org/{s3_warc_path}", headers={"Range": f"bytes={offset}-{offset + length - 1}"} + ) + response.raise_for_status() + + # Parse WARC record and extract HTML content + with io.BytesIO(response.content) as stream: + for record in warcio.ArchiveIterator(stream): + content = record.content_stream().read() + return content.decode(errors="ignore") + + raise ValueError(f"No WARC records found in response from {s3_warc_path}") + + +def find_html_in_cc(split_id: str, target_uri: str) -> str | None: + """ + We host our own index of the Common Crawl over GCP which we use in this function. + For each call we receive a list of chunks that contain the HTML content for the given target URI. + We then fetch each chunk and concatenate them together to form the complete HTML content. + Args: + split_id: The split ID of the Common Crawl + target_uri: The target URI to find the HTML content for + Returns: + The HTML content as a string + """ + resp = requests.get(f"{CC_IDX_HOST_URL}/{split_id}-index?url={target_uri}&output=json") + + resp.raise_for_status() + + chunks = [json.loads(chunk) for chunk in resp.text.split("\n") if chunk] + sorted_chunks = sorted(chunks, key=lambda x: x["offset"]) + + html_content = "" + + for chunk in sorted_chunks: + warc_path = chunk["filename"] + length = chunk["length"] + offset = chunk["offset"] + + warc_record = fetch_warc_from_cc(warc_path, length, offset) + + html_content += warc_record + + return html_content + + +def process_file(task: FileTask) -> None: + """Process a single DCLM file, fetching HTML from Common Crawl. 
+ + Args: + task: FileTask containing input and output file paths + """ + logger.info(f"Starting processing of file {task.input_file_path}") + logger.info(f"Source: {task.input_file_path}") + logger.info(f"Destination: {task.output_file_path}") + try: + ensure_parent_dir(task.output_file_path) + with ( + open_url(task.input_file_path, compression="zstd") as source, + open_url(task.output_file_path, "wt", compression="gzip") as output, + ): + text_wrapper = io.TextIOWrapper(source, encoding="utf-8") + + for line in tqdm(text_wrapper, desc="Processing lines"): + row = json.loads(line.strip()) + + # We need to extract the split from where the record was for querying the index + # The only place we have this information is in the warcinfo key in DCLM HQ + # The format is: + # warc-type: WARC/1.1 + # ... + # isPartOf: CC-MAIN-2024-01 + # This however is a string and not a key-value pair, so we need to extract + # the split from it via regex pattern `isPartOf:\s*(CC-MAIN-\d{4}-\d{2})`. + # This pattern groups the value of the key `isPartOf` that is of the form + # `CC-MAIN-xxxx-xx` where `xxxx` is a year and `xx` is a month. + match = re.search(r"isPartOf:\s*(CC-MAIN-\d{4}-\d{2})", row["metadata"]["warcinfo"]) + if match is None: + logger.error(f"No split found for record ID: {row['metadata']['WARC-Record-ID']}") + continue + + is_part_of = match.group(1) + + try: + html_string = find_html_in_cc(is_part_of, row["metadata"]["WARC-Target-URI"]) + + if html_string is None: + logger.error(f"No HTML found for record ID: {row['metadata']['WARC-Record-ID']}") + continue + + if "text" in row: + row.pop("text") + + row["html"] = html_string + + print(json.dumps(row), file=output) + except Exception as e: + logger.exception(f"Error processing line: {e}") + continue + + logger.info("\nProcessing completed successfully!") + logger.info(f"File available at: {task.output_file_path}") + + except Exception as e: + logger.error(f"Error during processing: {e}") + raise + + +def extract_dclm_hq_dump(cfg: DCLMHQDownloadConfig) -> None: + """Process the DCLM HQ dump in the input path and save the results to the output path. + + Flattens the nested directory structure (shards → files) into a single list of files + and processes them in parallel using zephyr. 
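+
+    A minimal usage sketch (bucket paths are hypothetical):
+
+        extract_dclm_hq_dump(DCLMHQDownloadConfig(
+            input_path="gs://example/dclm-baseline/global/",
+            output_path="gs://example/dclm-hq-html/",
+        ))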
+ """ + logger.info(f"Starting processing of DCLM HQ dump in {cfg.input_path}") + + # Flatten nested structure: discover all files upfront + all_files = [] + paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(cfg.input_path, "*"))] + + logger.info(f"Found {len(paths)} shards to process") + + for path in paths: + input_path = os.path.join(cfg.input_path, path) + shard_paths = fsspec_glob(os.path.join(input_path, "*.json.zst")) + + for shard_path in shard_paths: + input_file_path = shard_path + output_file_path = os.path.join(cfg.output_path, path, os.path.basename(shard_path)).replace( + ".json.zst", ".jsonl.gz" + ) + + all_files.append(FileTask(input_file_path=input_file_path, output_file_path=output_file_path)) + + logger.info(f"Found {len(all_files)} files to process") + + # Single-level parallelism over all files + pipeline = Dataset.from_list(all_files).map(process_file) + + ctx = ZephyrContext(name="download-dclm-html") + ctx.execute(pipeline) + + logger.info("Processing completed successfully!") + + +def dclm_hq_step( + name: str = "raw/dclm-hq-html", + *, + input_path: str, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads DCLM HQ HTML data from Common Crawl.""" + + def _run(output_path: str) -> None: + extract_dclm_hq_dump(DCLMHQDownloadConfig(input_path=input_path, output_path=output_path)) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={"input_path": input_path}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/lib/marin/src/marin/datakit/download/filesystem.py b/lib/marin/src/marin/datakit/download/filesystem.py new file mode 100644 index 0000000000..287426666f --- /dev/null +++ b/lib/marin/src/marin/datakit/download/filesystem.py @@ -0,0 +1,101 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import random +import time +from dataclasses import dataclass + +from iris.marin_fs import url_to_fs +from marin.execution.step_spec import StepSpec +from zephyr import Dataset, ZephyrContext + +from marin.utils import fsspec_exists, fsspec_glob + + +@dataclass +class TransferConfig: + input_path: str + output_path: str + + # Selectively choose the number of random files to transfer. None means all files + num_random_files: int | None = None + filetype: str = "jsonl.zst" + + +def transfer_files(config: TransferConfig) -> None: + """Transfers files from the input path to the output path. + + When num_random_files is None, copies the entire directory recursively. + When num_random_files is specified, randomly samples that many files and + copies them in parallel using zephyr. 
+ """ + if config.input_path.endswith("/"): + input_path = config.input_path[:-1] + else: + input_path = config.input_path + + print(f"Downloading {input_path} from GCS.") + start_time: float = time.time() + fs, _ = url_to_fs(input_path) + if not fs.exists(input_path): + raise FileNotFoundError(f"{input_path} does not exist.") + + # Glob all matching files + filenames = fsspec_glob(os.path.join(input_path, f"**/*.{config.filetype}")) + + # Select files: either random sample or all files + if config.num_random_files is None: + selected_files = filenames + else: + random.seed(42) + random.shuffle(filenames) + selected_files = filenames[: config.num_random_files] + + def copy_file(filename: str) -> None: + """Copy a single file if it doesn't already exist at destination.""" + output_filename = os.path.join(config.output_path, os.path.basename(filename)) + if not fsspec_exists(output_filename): + # Ensure output directory exists + fs.makedirs(config.output_path, exist_ok=True) + fs.copy(filename, output_filename) + + # Always use parallel copying via zephyr + pipeline = Dataset.from_list(selected_files).map(copy_file) + ctx = ZephyrContext(name="fs-transfer") + ctx.execute(pipeline) + + elapsed_time_seconds: float = time.time() - start_time + print(f"Downloaded {input_path} to {config.output_path} ({elapsed_time_seconds}s).") + + +def transfer_step( + name: str, + *, + input_path: str, + num_random_files: int | None = None, + filetype: str = "jsonl.zst", + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that transfers files between fsspec paths.""" + + def _run(output_path: str) -> None: + transfer_files( + TransferConfig( + input_path=input_path, + output_path=output_path, + num_random_files=num_random_files, + filetype=filetype, + ) + ) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={"input_path": input_path, "num_random_files": num_random_files, "filetype": filetype}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/lib/marin/src/marin/datakit/download/nemotron_cc.py b/lib/marin/src/marin/datakit/download/nemotron_cc.py new file mode 100644 index 0000000000..4b32983091 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/nemotron_cc.py @@ -0,0 +1,142 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Download and process Nemotron-CC dataset from Common Crawl. 
+ +Example Usage: +uv run zephyr --backend=ray --max-parallelism=100 --memory=4GB \ + lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py \ + --output_path gs://bucket/nemotron-output +""" + +import json +import logging +import os +from collections.abc import Iterator +from dataclasses import dataclass + +import requests +import zstandard +from iris.marin_fs import open_url +from marin.execution import THIS_OUTPUT_PATH +from marin.execution.step_spec import StepSpec +from marin.utils import fsspec_exists +from requests.adapters import HTTPAdapter +from urllib3.util import Retry +from zephyr import Dataset, ZephyrContext +from zephyr.writers import atomic_rename + +logger = logging.getLogger(__name__) + +myagent = "marin-nemotron-ingress/1.0" +NCC_PATH_FILE_URL = "https://data.commoncrawl.org/contrib/Nemotron/Nemotron-CC/data-jsonl.paths.gz" + + +def _iter_jsonl_from_zstd_stream(raw_stream) -> Iterator[dict]: + """Yield parsed JSON objects from a zstd-compressed JSONL stream.""" + dctx = zstandard.ZstdDecompressor() + with dctx.stream_reader(raw_stream) as reader: + buf = bytearray() + while True: + chunk = reader.read(1048576) + if not chunk: + break + buf.extend(chunk) + while True: + newline_pos = buf.find(b"\n") + if newline_pos < 0: + break + line_bytes = bytes(buf[:newline_pos]) + del buf[: newline_pos + 1] + if not line_bytes.strip(): + continue + yield json.loads(line_bytes) + + +def download_single_nemotron_path(input_file_path: str, output_file_path: str) -> dict: + """Fetches content from a Common Crawl path, streaming records to zstd output.""" + cc_url = f"https://data.commoncrawl.org/{input_file_path}" + logger.info(f"Downloading Nemotron CC file {cc_url} to {output_file_path}") + + session = requests.Session() + retries = Retry(total=5, backoff_factor=1.0, status_forcelist=[500, 502, 503, 504], allowed_methods=["GET"]) + adapter = HTTPAdapter(max_retries=retries) + session.mount("https://", adapter) + session.mount("http://", adapter) + + response = session.get(cc_url, headers={"user-agent": myagent}, stream=True) + response.raise_for_status() + + num_records = 0 + with atomic_rename(output_file_path) as temp_path: + with open_url(temp_path, "w", compression="zstd") as out: + for record in _iter_jsonl_from_zstd_stream(response.raw): + dolma_record = { + "id": record["warc_record_id"], + "text": record["text"], + "source": "nemotron", + "format": "text", + "metadata": {f"nemotron_{k}": v for k, v in record.items() if k not in ("warc_record_id", "text")}, + } + print(json.dumps(dolma_record), file=out) + num_records += 1 + + return {"input_file": input_file_path, "output_file": output_file_path, "num_records": num_records} + + +@dataclass +class NemotronIngressConfig: + output_path: str = THIS_OUTPUT_PATH + + +def download_nemotron_cc(cfg: NemotronIngressConfig): + paths_file_path = os.path.join(cfg.output_path, "data-jsonl.paths") + logger.info(f"Downloading Nemotron CC path file {paths_file_path}") + + with open_url(NCC_PATH_FILE_URL, "rb") as f, open_url(paths_file_path, "wb") as f_out: + f_out.write(f.read()) + + logger.info(f"Reading paths from {paths_file_path}") + all_files = [] + with open_url(paths_file_path, "r", compression="gzip") as f: + for line in f: + file = line.strip() + output_file_path = os.path.join(cfg.output_path, file).replace("jsonl.zstd", "jsonl.zst") + all_files.append((file, output_file_path)) + + logger.info(f"Processing {len(all_files)} Nemotron CC files") + + pipeline = ( + Dataset.from_list(all_files) + .filter(lambda file_info: not 
fsspec_exists(file_info[1])) + .map(lambda file_info: download_single_nemotron_path(*file_info)) + .write_jsonl(os.path.join(cfg.output_path, ".metrics/download-{shard:05d}.jsonl"), skip_existing=True) + ) + + ctx = ZephyrContext(name="download-nemotron-cc") + ctx.execute(pipeline) + + logger.info(f"Downloaded Nemotron CC files to {cfg.output_path}") + + +def nemotron_cc_step( + name: str = "raw/nemotron-cc", + *, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl.""" + + def _run(output_path: str) -> None: + download_nemotron_cc(NemotronIngressConfig(output_path=output_path)) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/lib/marin/src/marin/datakit/download/uncheatable_eval.py b/lib/marin/src/marin/datakit/download/uncheatable_eval.py new file mode 100644 index 0000000000..0bcdef3439 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/uncheatable_eval.py @@ -0,0 +1,438 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Download and normalize the latest Uncheatable Eval data dumps.""" + +from __future__ import annotations + +import json +import logging +import os +import posixpath +import re +from collections.abc import Iterable +from dataclasses import dataclass +from typing import Any + +import requests +from iris.marin_fs import open_url +from marin.execution import THIS_OUTPUT_PATH, ExecutorStep, VersionedValue +from marin.execution.step_spec import StepSpec +from marin.utils import fsspec_mkdirs +from requests.adapters import HTTPAdapter +from urllib3.util import Retry +from zephyr import Dataset, ZephyrContext +from zephyr.writers import atomic_rename + +logger = logging.getLogger(__name__) + +FILENAME_PATTERN = re.compile(r"^(?P.+)_(?P\d{8})to(?P\d{8})(?P(?:\.[^.]+)*)$") + +TEXT_FIELD_CANDIDATES: tuple[str, ...] = ( + "text", + "body", + "content", + "article", + "document", + "raw_text", + "code", + "message", + "description", + "story", +) + +LIST_FIELD_CANDIDATES: tuple[str, ...] = ( + "paragraphs", + "sentences", + "lines", + "messages", +) + +ID_FIELD_CANDIDATES: tuple[str, ...] 
= ( + "id", + "uuid", + "guid", + "doc_id", + "document_id", + "article_id", + "hash", + "sha", + "uid", +) + + +@dataclass(frozen=True) +class UncheatableEvalDataset: + """Information about a single data dump file from the Uncheatable Eval repository.""" + + benchmark: str + start_date: str + end_date: str + name: str + download_url: str + sha: str | None = None + size: int | None = None + + @property + def date_range(self) -> str: + return f"{self.start_date}to{self.end_date}" + + @property + def source_label(self) -> str: + return f"{self.benchmark}:{self.date_range}" + + def output_filename(self, suffix: str = ".jsonl.gz") -> str: + return f"{self.benchmark}_{self.date_range}{suffix}" + + +@dataclass +class UncheatableEvalDownloadConfig: + """Configuration for downloading and normalizing Uncheatable Eval dumps.""" + + output_path: str | VersionedValue[str] = THIS_OUTPUT_PATH + repo_owner: str | VersionedValue[str] = "Jellyfish042" + repo_name: str | VersionedValue[str] = "uncheatable_eval" + data_path: str | VersionedValue[str] = "data" + branch: str | VersionedValue[str] = "master" + max_concurrent_downloads: int = 8 + request_timeout: int = 120 + github_token: str | None = None + skip_existing: bool = True + metadata_filename: str = "metadata.json" + + +def _http_headers(cfg: UncheatableEvalDownloadConfig) -> dict[str, str]: + headers = {"Accept": "application/vnd.github+json"} + token = cfg.github_token or os.environ.get("GITHUB_TOKEN") + if token: + headers["Authorization"] = f"Bearer {token}" + return headers + + +def _fetch_directory_listing(cfg: UncheatableEvalDownloadConfig) -> list[dict[str, Any]]: + """Return the list of files in the configured GitHub repository directory.""" + + headers = _http_headers(cfg) + base_url = f"https://api.github.com/repos/{cfg.repo_owner!s}/{cfg.repo_name!s}/contents/{cfg.data_path!s}" + params = {"ref": str(cfg.branch)} + response = requests.get(base_url, headers=headers, params=params, timeout=cfg.request_timeout) + response.raise_for_status() + payload = response.json() + if not isinstance(payload, list): + raise ValueError(f"Unexpected response from GitHub API: {payload!r}") + return payload + + +def _parse_available_dumps(entries: Iterable[dict[str, Any]]) -> list[UncheatableEvalDataset]: + """Parse GitHub directory entries into dataset metadata.""" + + datasets: list[UncheatableEvalDataset] = [] + for entry in entries: + name = entry.get("name") + if not isinstance(name, str): + continue + match = FILENAME_PATTERN.match(name) + if not match: + continue + benchmark = match.group("benchmark") + start = match.group("start") + end = match.group("end") + download_url = entry.get("download_url") + if not isinstance(download_url, str): + logger.debug("Skipping %s because it has no download_url", name) + continue + datasets.append( + UncheatableEvalDataset( + benchmark=benchmark, + start_date=start, + end_date=end, + name=name, + download_url=download_url, + sha=entry.get("sha"), + size=entry.get("size"), + ) + ) + return datasets + + +def _select_latest_dumps(datasets: Iterable[UncheatableEvalDataset]) -> list[UncheatableEvalDataset]: + """Select the latest dump for each benchmark based on the end date (and start date as tie breaker).""" + + latest: dict[str, UncheatableEvalDataset] = {} + for dataset in datasets: + existing = latest.get(dataset.benchmark) + if existing is None: + latest[dataset.benchmark] = dataset + continue + candidate_key = (dataset.end_date, dataset.start_date, dataset.name) + existing_key = (existing.end_date, 
existing.start_date, existing.name) + if candidate_key > existing_key: + latest[dataset.benchmark] = dataset + return sorted(latest.values(), key=lambda d: d.benchmark) + + +def _extract_id(raw: Any, dataset: UncheatableEvalDataset, index: int) -> str: + if isinstance(raw, dict): + for key in ID_FIELD_CANDIDATES: + value = raw.get(key) + if value: + return str(value) + metadata = raw.get("metadata") + if isinstance(metadata, dict): + for key in ID_FIELD_CANDIDATES: + value = metadata.get(key) + if value: + return str(value) + return f"{dataset.benchmark}_{dataset.date_range}_{index:06d}" + + +def _join_list_field(value: Any) -> str | None: + if isinstance(value, list): + text_items = [str(item) for item in value if item is not None] + if text_items: + return "\n".join(text_items) + return None + + +def _extract_text(raw: Any) -> str | None: + if raw is None: + return None + if isinstance(raw, str): + return raw + if isinstance(raw, dict): + for key in TEXT_FIELD_CANDIDATES: + value = raw.get(key) + if isinstance(value, str) and value.strip(): + return value + for key in TEXT_FIELD_CANDIDATES: + value = raw.get(key) + joined = _join_list_field(value) + if joined: + return joined + for key in LIST_FIELD_CANDIDATES: + joined = _join_list_field(raw.get(key)) + if joined: + return joined + title = raw.get("title") + body = raw.get("body") + if isinstance(title, str) and isinstance(body, str): + combined = f"{title.strip()}\n\n{body.strip()}" + if combined.strip(): + return combined + if isinstance(title, str) and title.strip(): + return title + return json.dumps(raw, ensure_ascii=False) + return str(raw) + + +def _normalize_record(raw: Any, dataset: UncheatableEvalDataset, index: int) -> dict[str, str]: + text = _extract_text(raw) + if text is None or not str(text).strip(): + raise ValueError(f"Record {index} in {dataset.name} does not contain text") + record_id = _extract_id(raw, dataset, index) + return {"id": record_id, "text": text, "source": dataset.source_label} + + +def _download_and_convert_single( + task: DownloadTask, +) -> dict[str, Any]: + session = requests.Session() + retries = Retry(total=5, backoff_factor=1.0, status_forcelist=[500, 502, 503, 504], allowed_methods=["GET"]) + adapter = HTTPAdapter(max_retries=retries) + session.mount("https://", adapter) + session.mount("http://", adapter) + + logger.info("Downloading %s from %s", task.dataset.name, task.download_url) + response = session.get(task.download_url, timeout=task.cfg.request_timeout, headers=_http_headers(task.cfg)) + response.raise_for_status() + + try: + payload = response.json() + except ValueError as exc: + raise ValueError(f"Failed to decode JSON payload for {task.dataset.name}") from exc + + if not isinstance(payload, list): + raise ValueError(f"Expected list in dataset {task.dataset.name}, found {type(payload).__name__}") + + fsspec_mkdirs(os.path.dirname(task.output_file_path), exist_ok=True) + + record_count = 0 + with atomic_rename(task.output_file_path) as temp_path: + with open_url(temp_path, "wt", encoding="utf-8", compression="gzip") as outfile: + for index, raw in enumerate(payload): + normalized = _normalize_record(raw, task.dataset, index) + json.dump(normalized, outfile, ensure_ascii=False) + outfile.write("\n") + record_count += 1 + + logger.info("Wrote %s records to %s", record_count, task.output_file_path) + return {"records": record_count, "output_file": task.output_file_path} + + +@dataclass +class DownloadTask: + download_url: str + output_file_path: str + dataset: UncheatableEvalDataset + cfg: 
UncheatableEvalDownloadConfig + + +def _generate_tasks( + datasets: Iterable[UncheatableEvalDataset], + cfg: UncheatableEvalDownloadConfig, +) -> tuple[list[DownloadTask], list[UncheatableEvalDataset]]: + tasks: list[DownloadTask] = [] + filtered: list[UncheatableEvalDataset] = [] + for dataset in datasets: + output_file = posixpath.join(str(cfg.output_path), dataset.output_filename()) + tasks.append(DownloadTask(dataset.download_url, output_file, dataset, cfg)) + filtered.append(dataset) + return tasks, filtered + + +def _write_metadata(cfg: UncheatableEvalDownloadConfig, records: list[dict[str, Any]]) -> None: + if not records: + return + metadata_path = posixpath.join(str(cfg.output_path), cfg.metadata_filename) + with open_url(metadata_path, "w", encoding="utf-8") as meta_file: + json.dump(records, meta_file, indent=2, ensure_ascii=False) + logger.info("Wrote metadata to %s", metadata_path) + + +def download_latest_uncheatable_eval(cfg: UncheatableEvalDownloadConfig) -> dict[str, Any]: + """Download and normalize the newest Uncheatable Eval dump for each benchmark.""" + + entries = _fetch_directory_listing(cfg) + datasets = _parse_available_dumps(entries) + latest_datasets = _select_latest_dumps(datasets) + + if not latest_datasets: + logger.warning("No datasets found that match the expected naming pattern") + return {"success": False, "reason": "no_datasets"} + + output_path = str(cfg.output_path) + fsspec_mkdirs(output_path, exist_ok=True) + + tasks, filtered_datasets = _generate_tasks(latest_datasets, cfg) + + if not tasks: + logger.info("No new datasets to process") + return {"success": True, "reason": "already_processed", "skipped": True} + + metadata_records: list[dict[str, Any]] = [] + + pipeline = ( + Dataset.from_list(tasks) + .map(lambda task: _download_and_convert_single(task)) + .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) + ) + ctx = ZephyrContext(name="download-uncheatable-eval") + output_paths = ctx.execute(pipeline) + + for dataset, metadata_file in zip(filtered_datasets, output_paths, strict=True): + with open_url(metadata_file, "r", encoding="utf-8") as meta_file: + result = json.load(meta_file) + + try: + metadata_records.append( + { + "benchmark": dataset.benchmark, + "start_date": dataset.start_date, + "end_date": dataset.end_date, + "source": dataset.source_label, + "output_file": posixpath.join(output_path, dataset.output_filename()), + "records": result.get("records"), + "sha": dataset.sha, + "size": dataset.size, + } + ) + except Exception: + logger.exception("Failed to process dataset %s", dataset.name) + raise + + _write_metadata(cfg, metadata_records) + return {"success": True, "processed": metadata_records} + + +def uncheatable_eval_step( + name: str = "raw/uncheatable-eval/latest", + *, + repo_owner: str = "ziqing-huang", + repo_name: str = "uncheatable_eval", + data_path: str = "data", + branch: str = "master", + max_concurrent_downloads: int = 8, + request_timeout: int = 120, + github_token: str | None = None, + skip_existing: bool = True, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads the latest Uncheatable Eval dumps.""" + + def _run(output_path: str) -> dict: + cfg = UncheatableEvalDownloadConfig( + output_path=output_path, + repo_owner=repo_owner, + repo_name=repo_name, + data_path=data_path, + branch=branch, + max_concurrent_downloads=max_concurrent_downloads, + 
request_timeout=request_timeout, + github_token=github_token, + skip_existing=skip_existing, + ) + return download_latest_uncheatable_eval(cfg) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={ + "repo_owner": repo_owner, + "repo_name": repo_name, + "data_path": data_path, + "branch": branch, + }, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) + + +def make_uncheatable_eval_step( + *, + name: str = "raw/uncheatable-eval/latest", + repo_owner: str = "ziqing-huang", + repo_name: str = "uncheatable_eval", + data_path: str = "data", + branch: str = "master", + max_concurrent_downloads: int = 8, + request_timeout: int = 120, + github_token: str | None = None, + skip_existing: bool = True, +) -> ExecutorStep: + """Create an ExecutorStep that downloads the latest Uncheatable Eval dumps. + + Backward-compat wrapper around uncheatable_eval_step(). + """ + return uncheatable_eval_step( + name=name, + repo_owner=repo_owner, + repo_name=repo_name, + data_path=data_path, + branch=branch, + max_concurrent_downloads=max_concurrent_downloads, + request_timeout=request_timeout, + github_token=github_token, + skip_existing=skip_existing, + ).as_executor_step() + + +__all__ = [ + "UncheatableEvalDataset", + "UncheatableEvalDownloadConfig", + "download_latest_uncheatable_eval", + "make_uncheatable_eval_step", + "uncheatable_eval_step", +] diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py new file mode 100644 index 0000000000..1dce125a0f --- /dev/null +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -0,0 +1,150 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +wikipedia/download.py + +Download script for the Wikipedia raw HTML data, provided by Wikimedia. + +Home Page: https://dumps.wikimedia.org/other/enterprise_html/runs/ + +Example Usage (production, large dataset): +ENWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/enwiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz +uv run zephyr --backend=ray --max-parallelism=10 \ + lib/marin/src/marin/download/wikipedia/download.py \ + --input_urls $ENWIKI \ + --revision 20250320 --output_path gs://path/to/output + +Example Usage (local testing, small dataset): +SIMPLEWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/simplewiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz +uv run zephyr --backend=threadpool --max-parallelism=4 --entry-point=download \ + lib/marin/src/marin/download/wikipedia/download.py \ + --input_urls "[$SIMPLEWIKI]" \ + --revision 20250320 --output_path /tmp/wikipedia_test + +Note: The enwiki-NS0 file (English Wikipedia, namespace 0 = articles) is approximately 130 GB compressed. + The simplewiki-NS0 file (Simple English Wikipedia) is much smaller at ~2 GB compressed. 
+""" + +import logging +import os +import tarfile +from collections.abc import Iterable +from dataclasses import dataclass + +import draccus +import requests +from iris.marin_fs import open_url +from marin.execution.step_spec import StepSpec +from marin.utils import fsspec_size +from tqdm_loggable.auto import tqdm +from zephyr import Dataset, ZephyrContext, atomic_rename, load_jsonl + +logger = logging.getLogger(__name__) + + +@dataclass +class WikipediaDownloadConfig: + input_urls: list[str] + revision: str + output_path: str + + +def download_tar(url: str, output_prefix) -> str: + shard_filename = url.split("/")[-1] + output_filename = os.path.join(output_prefix, shard_filename) + logger.info(f"Downloading URL: {url} to {output_filename}") + + try: + total_size = fsspec_size(url) + pbar = tqdm(total=total_size, desc="Downloading File", unit="B", unit_scale=True) + + with atomic_rename(output_filename) as tmp_filename, open_url(tmp_filename, "wb") as f: + r = requests.get(url, stream=True) + + for chunk in r.raw.stream(20 * 1024 * 1024, decode_content=False): + if chunk: + f.write(chunk) + f.flush() + + pbar.update(len(chunk)) + + return output_filename + except Exception as e: + logger.error(f"Error downloading URL: {url}") + raise e + + +def process_file(input_file: str, output_path: str) -> Iterable[str]: + logger.info(f"Processing file: {input_file}") + logger.info(f"Output path: {output_path}") + + try: + with open_url(input_file) as f: + with tarfile.open(fileobj=f, mode="r:gz") as tr: + for info in tr: + with tr.extractfile(info) as file: + file_content = file.read() + file_path = os.path.join(output_path, info.name + ".gz") + + # Each file is a .ndjson file, which contains about 18k-21k articles + # per file with size ranging from 200MB to 300MB + with ( + atomic_rename(file_path) as tmpfile_path, + open_url(tmpfile_path, "wb", compression="gzip") as output_f, + ): + output_f.write(file_content) + yield file_path + + except Exception as e: + logger.error(f"Error processing file: {input_file}") + raise e + + +@draccus.wrap() +def download(cfg: WikipediaDownloadConfig) -> None: + """Download and process Wikipedia data.""" + logger.info("Starting transfer of Wikipedia dump...") + output_base = os.path.join(cfg.output_path, cfg.revision) + + ctx = ZephyrContext(name="download-wikipedia") + download_metrics = ctx.execute( + Dataset.from_list(cfg.input_urls) + .map(lambda url: download_tar(url, output_base)) + .write_jsonl(f"{output_base}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), + ) + + # load all of the output filenames to process + downloads = ctx.execute(Dataset.from_list(download_metrics).flat_map(load_jsonl)) + + extracted = ctx.execute( + Dataset.from_list(downloads) + .flat_map(lambda file: process_file(file, output_base)) + .write_jsonl(f"{output_base}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), + ) + + logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) + + +def wikipedia_step( + name: str = "raw/wikipedia", + *, + input_urls: list[str], + revision: str, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads and processes Wikipedia HTML dumps.""" + + def _run(output_path: str) -> None: + download(WikipediaDownloadConfig(input_urls=input_urls, revision=revision, output_path=output_path)) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={"input_urls": input_urls, "revision": 
revision}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) From df63b150e47afcb414756c562437989f25dd1e5f Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:14:52 -0700 Subject: [PATCH 06/56] Convert marin.download.* to backward-compat re-export shims All download module implementations now live in marin.datakit.download.*. The old marin.download.* files are replaced with explicit re-exports from the canonical locations. Renamed configs (Ar5ivDownloadConfig, WikipediaDownloadConfig) are re-exported under their original names for backward compat. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/download/__init__.py | 7 +- .../src/marin/download/ar5iv/__init__.py | 2 + .../src/marin/download/ar5iv/download.py | 136 +----- .../src/marin/download/dclm_hq/__init__.py | 2 + .../download/dclm_hq/download_dclm_hq_html.py | 214 +--------- .../src/marin/download/filesystem/__init__.py | 2 + .../src/marin/download/filesystem/transfer.py | 68 +-- .../marin/download/huggingface/__init__.py | 2 + .../marin/download/huggingface/download_hf.py | 353 +--------------- .../huggingface/stream_remove_columns.py | 104 +---- .../download/huggingface/upload_gcs_to_hf.py | 362 +--------------- .../marin/download/nemotron_cc/__init__.py | 2 + .../nemotron_cc/download_nemotron_cc.py | 120 +----- .../download/uncheatable_eval/__init__.py | 2 + .../download/uncheatable_eval/download.py | 396 +----------------- .../src/marin/download/wikipedia/__init__.py | 2 + .../src/marin/download/wikipedia/download.py | 126 +----- 17 files changed, 64 insertions(+), 1836 deletions(-) create mode 100644 lib/marin/src/marin/download/ar5iv/__init__.py create mode 100644 lib/marin/src/marin/download/dclm_hq/__init__.py create mode 100644 lib/marin/src/marin/download/filesystem/__init__.py create mode 100644 lib/marin/src/marin/download/huggingface/__init__.py create mode 100644 lib/marin/src/marin/download/nemotron_cc/__init__.py create mode 100644 lib/marin/src/marin/download/uncheatable_eval/__init__.py create mode 100644 lib/marin/src/marin/download/wikipedia/__init__.py diff --git a/lib/marin/src/marin/download/__init__.py b/lib/marin/src/marin/download/__init__.py index b5a56a002d..26067cbf97 100644 --- a/lib/marin/src/marin/download/__init__.py +++ b/lib/marin/src/marin/download/__init__.py @@ -1,6 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. 
Canonical location: marin.datakit.download -from .huggingface.download_hf import DownloadConfig as HfDownloadConfig -from .huggingface.download_hf import download_hf -from .huggingface.download_hf import download_hf as download_hf_ungated +from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig +from marin.datakit.download.huggingface import download_hf +from marin.datakit.download.huggingface import download_hf as download_hf_ungated diff --git a/lib/marin/src/marin/download/ar5iv/__init__.py b/lib/marin/src/marin/download/ar5iv/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/ar5iv/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/ar5iv/download.py b/lib/marin/src/marin/download/ar5iv/download.py index 9483370c71..1a64dbf93e 100644 --- a/lib/marin/src/marin/download/ar5iv/download.py +++ b/lib/marin/src/marin/download/ar5iv/download.py @@ -1,135 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.ar5iv -""" -Download and process Ar5iv dataset from a zip file. - -Example Usage: -uv run zephyr --backend=ray --max-parallelism=1000 --memory=10GB \ - lib/marin/src/marin/download/ar5iv/download.py \ - --input_path gs://bucket/ar5iv.zip \ - --output_path gs://bucket/output -""" - -import json -import logging -import zipfile -from collections import defaultdict -from dataclasses import dataclass - -import draccus -from iris.marin_fs import open_url -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename -from iris.logging import configure_logging - -logger = logging.getLogger(__name__) - - -@dataclass -class DownloadConfig: - input_path: str - output_path: str - max_files: int | None = None # Maximum number of shards to process - - -def process_shard(shard_task: dict) -> dict: - """ - Process a single shard by extracting its files from the zip in GCS and uploading the merged JSONL. - - Args: - shard_task: Dict with keys 'input_path', 'output_path', 'shard_id', 'file_list' - """ - input_path = shard_task["input_path"] - output_path = shard_task["output_path"] - shard_id = shard_task["shard_id"] - file_list = shard_task["file_list"] - gcs_path = f"{output_path}/{shard_id}.jsonl.gz" - - with open_url(str(input_path), "rb") as f: - with zipfile.ZipFile(f) as zf: - with atomic_rename(gcs_path) as temp_path, open_url(temp_path, "wt", compression="gzip") as out_f: - for filename in file_list: - with zf.open(filename, "r") as file_handle: - content = file_handle.read() - record = { - "filename": filename, - "format": "html", - "content": content.decode("utf-8", errors="replace"), - } - print(json.dumps(record), file=out_f) - - logger.info(f"Shard {shard_id} with {len(file_list)} files uploaded to {gcs_path}") - return {"shard_id": shard_id, "num_files": len(file_list), "output_path": gcs_path} - - -def download(cfg: DownloadConfig) -> None: - """ - Download and process Ar5iv dataset from a zip file in GCS. - - This function can be called by the executor framework or used standalone. 
- """ - logger.info("Starting transfer of Ar5iv dataset...") - logger.info(f"Source: {cfg.input_path}") - - # Use fsspec+zipfile to list all files - with open_url(str(cfg.input_path), "rb") as f: - with zipfile.ZipFile(f) as zf: - all_files = zf.infolist() - - # Group by shard directory - # We assume structure: something like: shard_id/.../file - # shard_id is derived from the second last component if files are nested. - # Adjust as needed if directory structure differs. - shard_dict = defaultdict(list) - for info in all_files: - if info.is_dir(): - continue - # E.g. path might look like: "003/something.html" - # Extract shard_id from the directory: - # Split by "/" and take the first part if we assume structure {shard_id}/file - parts = info.filename.strip("/").split("/") - if len(parts) < 2: - # File at root level - decide how to handle this case. - # If no directory structure is given, skip or treat differently. - continue - shard_id = parts[-2] # get the second-last directory as shard_id - shard_dict[shard_id].append(info.filename) - - # Apply max_files limit if provided - shard_ids = list(shard_dict.keys()) - if cfg.max_files is not None: - shard_ids = shard_ids[: cfg.max_files] - - logger.info(f"Found {len(shard_ids)} shards to process.") - - # Build task list for each shard - shard_tasks = [] - for shard_id in shard_ids: - shard_tasks.append( - { - "input_path": cfg.input_path, - "output_path": cfg.output_path, - "shard_id": shard_id, - "file_list": shard_dict[shard_id], - } - ) - - # Execute pipeline with zephyr - pipeline = ( - Dataset.from_list(shard_tasks) - .map(process_shard) - .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) - ) - ctx = ZephyrContext(name="download-ar5iv") - ctx.execute(pipeline) - - logger.info("Transfer completed successfully!") - - -@draccus.wrap() -def main(cfg: DownloadConfig) -> None: - """CLI entrypoint for downloading and processing Ar5iv dataset.""" - - configure_logging(level=logging.INFO) - download(cfg) +from marin.datakit.download.ar5iv import Ar5ivDownloadConfig as DownloadConfig # noqa: F401 - used by tests +from marin.datakit.download.ar5iv import download as download +from marin.datakit.download.ar5iv import process_shard as process_shard diff --git a/lib/marin/src/marin/download/dclm_hq/__init__.py b/lib/marin/src/marin/download/dclm_hq/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/dclm_hq/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py b/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py index 9250ede43d..a49caab9d7 100644 --- a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py +++ b/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py @@ -1,208 +1,10 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 - -""" -Download DCLM HQ HTML data by fetching HTML content from Common Crawl. - -Processes DCLM HQ JSONL files and enriches them with HTML content fetched from Common Crawl -via a custom index server. Uses zephyr for parallel processing with flattened parallelism. 
- -Example Usage: -uv run zephyr --backend=ray --max-parallelism=800 --memory=2GB \ - lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py \ - --input_path gs://marin-us-central2/raw/dclm-baseline-1.0-parquet/global/ \ - --output_path gs://marin-data/processed/dclm-hq-html/ -""" - -import io -import json -import logging -import os -import re -from dataclasses import dataclass - -import requests -from iris.marin_fs import open_url -import warcio -from marin.utils import fsspec_glob -from tqdm import tqdm -from zephyr import Dataset, ZephyrContext -from zephyr.writers import ensure_parent_dir - -CC_IDX_HOST_URL = "http://34.72.201.218:8080" -logger = logging.getLogger(__name__) - - -@dataclass -class DCLMHQDownloadConfig: - input_path: str - output_path: str - - -@dataclass -class FileTask: - """Represents a single file processing task.""" - - input_file_path: str - output_file_path: str - - -def fetch_warc_from_cc(s3_warc_path: str, length: int, offset: int) -> str: - """ - Fetch a WARC record from Common Crawl S3 bucket using byte range requests we get - from the CC index via `find_html_in_cc`. - Args: - s3_warc_path: Path to WARC file in S3 bucket - length: Length of the record in bytes - offset: Byte offset of the record in the WARC file - Returns: - The WARC record content as a string - """ - # Convert string values to integers - offset = int(offset) - length = int(length) - - # Make range request to CommonCrawl - response = requests.get( - f"https://data.commoncrawl.org/{s3_warc_path}", headers={"Range": f"bytes={offset}-{offset + length - 1}"} - ) - response.raise_for_status() - - # Parse WARC record and extract HTML content - with io.BytesIO(response.content) as stream: - for record in warcio.ArchiveIterator(stream): - content = record.content_stream().read() - return content.decode(errors="ignore") - - raise ValueError(f"No WARC records found in response from {s3_warc_path}") - - -def find_html_in_cc(split_id: str, target_uri: str) -> str | None: - """ - We host our own index of the Common Crawl over GCP which we use in this function. - For each call we receive a list of chunks that contain the HTML content for the given target URI. - We then fetch each chunk and concatenate them together to form the complete HTML content. - Args: - split_id: The split ID of the Common Crawl - target_uri: The target URI to find the HTML content for - Returns: - The HTML content as a string - """ - resp = requests.get(f"{CC_IDX_HOST_URL}/{split_id}-index?url={target_uri}&output=json") - - resp.raise_for_status() - - chunks = [json.loads(chunk) for chunk in resp.text.split("\n") if chunk] - sorted_chunks = sorted(chunks, key=lambda x: x["offset"]) - - html_content = "" - - for chunk in sorted_chunks: - warc_path = chunk["filename"] - length = chunk["length"] - offset = chunk["offset"] - - warc_record = fetch_warc_from_cc(warc_path, length, offset) - - html_content += warc_record - - return html_content - - -def process_file(task: FileTask) -> None: - """Process a single DCLM file, fetching HTML from Common Crawl. 
- - Args: - task: FileTask containing input and output file paths - """ - logger.info(f"Starting processing of file {task.input_file_path}") - logger.info(f"Source: {task.input_file_path}") - logger.info(f"Destination: {task.output_file_path}") - try: - ensure_parent_dir(task.output_file_path) - with ( - open_url(task.input_file_path, compression="zstd") as source, - open_url(task.output_file_path, "wt", compression="gzip") as output, - ): - text_wrapper = io.TextIOWrapper(source, encoding="utf-8") - - for line in tqdm(text_wrapper, desc="Processing lines"): - row = json.loads(line.strip()) - - # We need to extract the split from where the record was for querying the index - # The only place we have this information is in the warcinfo key in DCLM HQ - # The format is: - # warc-type: WARC/1.1 - # ... - # isPartOf: CC-MAIN-2024-01 - # This however is a string and not a key-value pair, so we need to extract - # the split from it via regex pattern `isPartOf:\s*(CC-MAIN-\d{4}-\d{2})`. - # This pattern groups the value of the key `isPartOf` that is of the form - # `CC-MAIN-xxxx-xx` where `xxxx` is a year and `xx` is a month. - match = re.search(r"isPartOf:\s*(CC-MAIN-\d{4}-\d{2})", row["metadata"]["warcinfo"]) - if match is None: - logger.error(f"No split found for record ID: {row['metadata']['WARC-Record-ID']}") - continue - - is_part_of = match.group(1) - - try: - html_string = find_html_in_cc(is_part_of, row["metadata"]["WARC-Target-URI"]) - - if html_string is None: - logger.error(f"No HTML found for record ID: {row['metadata']['WARC-Record-ID']}") - continue - - if "text" in row: - row.pop("text") - - row["html"] = html_string - - print(json.dumps(row), file=output) - except Exception as e: - logger.exception(f"Error processing line: {e}") - continue - - logger.info("\nProcessing completed successfully!") - logger.info(f"File available at: {task.output_file_path}") - - except Exception as e: - logger.error(f"Error during processing: {e}") - raise - - -def extract_dclm_hq_dump(cfg: DCLMHQDownloadConfig) -> None: - """Process the DCLM HQ dump in the input path and save the results to the output path. - - Flattens the nested directory structure (shards → files) into a single list of files - and processes them in parallel using zephyr. - """ - logger.info(f"Starting processing of DCLM HQ dump in {cfg.input_path}") - - # Flatten nested structure: discover all files upfront - all_files = [] - paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(cfg.input_path, "*"))] - - logger.info(f"Found {len(paths)} shards to process") - - for path in paths: - input_path = os.path.join(cfg.input_path, path) - shard_paths = fsspec_glob(os.path.join(input_path, "*.json.zst")) - - for shard_path in shard_paths: - input_file_path = shard_path - output_file_path = os.path.join(cfg.output_path, path, os.path.basename(shard_path)).replace( - ".json.zst", ".jsonl.gz" - ) - - all_files.append(FileTask(input_file_path=input_file_path, output_file_path=output_file_path)) - - logger.info(f"Found {len(all_files)} files to process") - - # Single-level parallelism over all files - pipeline = Dataset.from_list(all_files).map(process_file) - - ctx = ZephyrContext(name="download-dclm-html") - ctx.execute(pipeline) - - logger.info("Processing completed successfully!") +# Backward-compat shim. 
Canonical location: marin.datakit.download.dclm_hq + +from marin.datakit.download.dclm_hq import DCLMHQDownloadConfig as DCLMHQDownloadConfig +from marin.datakit.download.dclm_hq import FileTask as FileTask +from marin.datakit.download.dclm_hq import extract_dclm_hq_dump as extract_dclm_hq_dump +from marin.datakit.download.dclm_hq import fetch_warc_from_cc as fetch_warc_from_cc +from marin.datakit.download.dclm_hq import find_html_in_cc as find_html_in_cc +from marin.datakit.download.dclm_hq import process_file as process_file diff --git a/lib/marin/src/marin/download/filesystem/__init__.py b/lib/marin/src/marin/download/filesystem/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/filesystem/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/filesystem/transfer.py b/lib/marin/src/marin/download/filesystem/transfer.py index e28a6667d8..5456bf8cc5 100644 --- a/lib/marin/src/marin/download/filesystem/transfer.py +++ b/lib/marin/src/marin/download/filesystem/transfer.py @@ -1,68 +1,6 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.filesystem -import os -import random -import time -from dataclasses import dataclass - -from iris.marin_fs import url_to_fs -from zephyr import Dataset, ZephyrContext - -from marin.utils import fsspec_exists, fsspec_glob - - -@dataclass -class TransferConfig: - input_path: str - output_path: str - - # Selectively choose the number of random files to transfer. None means all files - num_random_files: int | None = None - filetype: str = "jsonl.zst" - - -def transfer_files(config: TransferConfig) -> None: - """Transfers files from the input path to the output path. - - When num_random_files is None, copies the entire directory recursively. - When num_random_files is specified, randomly samples that many files and - copies them in parallel using zephyr. 
- """ - if config.input_path.endswith("/"): - input_path = config.input_path[:-1] - else: - input_path = config.input_path - - print(f"Downloading {input_path} from GCS.") - start_time: float = time.time() - fs, _ = url_to_fs(input_path) - if not fs.exists(input_path): - raise FileNotFoundError(f"{input_path} does not exist.") - - # Glob all matching files - filenames = fsspec_glob(os.path.join(input_path, f"**/*.{config.filetype}")) - - # Select files: either random sample or all files - if config.num_random_files is None: - selected_files = filenames - else: - random.seed(42) - random.shuffle(filenames) - selected_files = filenames[: config.num_random_files] - - def copy_file(filename: str) -> None: - """Copy a single file if it doesn't already exist at destination.""" - output_filename = os.path.join(config.output_path, os.path.basename(filename)) - if not fsspec_exists(output_filename): - # Ensure output directory exists - fs.makedirs(config.output_path, exist_ok=True) - fs.copy(filename, output_filename) - - # Always use parallel copying via zephyr - pipeline = Dataset.from_list(selected_files).map(copy_file) - ctx = ZephyrContext(name="fs-transfer") - ctx.execute(pipeline) - - elapsed_time_seconds: float = time.time() - start_time - print(f"Downloaded {input_path} to {config.output_path} ({elapsed_time_seconds}s).") +from marin.datakit.download.filesystem import TransferConfig as TransferConfig +from marin.datakit.download.filesystem import transfer_files as transfer_files diff --git a/lib/marin/src/marin/download/huggingface/__init__.py b/lib/marin/src/marin/download/huggingface/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/huggingface/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/huggingface/download_hf.py b/lib/marin/src/marin/download/huggingface/download_hf.py index 089ef63e0c..9912a5d2c0 100644 --- a/lib/marin/src/marin/download/huggingface/download_hf.py +++ b/lib/marin/src/marin/download/huggingface/download_hf.py @@ -1,353 +1,12 @@ -#!/usr/bin/env python3 # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.huggingface -""" -A script to download a HuggingFace dataset and upload it to a specified fsspec path, -using HfFileSystem for direct streaming of data transfer. 
-""" - -import logging -import os -import random -import socket -import time -from dataclasses import dataclass, field - -import draccus -import huggingface_hub -from huggingface_hub import HfFileSystem -from iris.marin_fs import open_url, url_to_fs -from huggingface_hub.errors import HfHubHTTPError -from packaging.version import Version -from marin.execution.executor import THIS_OUTPUT_PATH -from marin.utilities.validation_utils import write_provenance_json -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename -from iris.logging import configure_logging - -logger = logging.getLogger(__name__) - -HF_PROTOCOL_PREFIX = "hf://" -HF_BUCKET_PATH_PREFIX = "buckets/" - - -@dataclass(frozen=True) -class DownloadConfig: - # fmt: off - - # HuggingFace Dataset Parameters - hf_dataset_id: str # HF Dataset to Download (as `$ORG/$DATASET` on HF Hub) - - revision: str # (Short) Commit Hash (from HF Dataset Repo; 7 characters) - hf_urls_glob: list[str] = field(default_factory=list) - # List of Glob Patterns to Match Files in HF Dataset, If empty we get all the files in a hf repo - - gcs_output_path: str = THIS_OUTPUT_PATH - """ - Path to store raw data in persistent storage (e.g. gs://$BUCKET/...). - This works with any fsspec-compatible path, but for backwards compatibility, we call it gcs_output_path. - """ - - append_sha_to_path: bool = False - """If true, write outputs under ``gcs_output_path/`` instead of directly under ``gcs_output_path``.""" - - # Job Control Parameters, used only for non-gated dataset transfers done via STS - wait_for_completion: bool = True # if True, will block until job completes - - # fmt: on - hf_repo_type_prefix: str = ( - "datasets" # The repo_type_prefix is datasets/ for datasets, - # spaces/ for spaces, and models do not need a prefix in the URL. - ) - - zephyr_max_parallelism: int = 8 - """Maximum parallelism of the Zephyr download job""" - - read_timeout_seconds: float = 120.0 - """Socket read timeout while streaming each HF file. Timeout failures trigger retries.""" - - progress_log_interval_seconds: float = 60.0 - """Log a heartbeat for each in-flight shard every N seconds while bytes are flowing.""" - - read_chunk_size_mib: int = 8 - """Chunk size for each streaming read from HF.""" - - -def _strip_hf_protocol(path: str) -> str: - return path.removeprefix(HF_PROTOCOL_PREFIX).lstrip("/") - - -def _resolve_hf_source_path(cfg: DownloadConfig) -> str: - source_path = ( - os.path.join(cfg.hf_repo_type_prefix, cfg.hf_dataset_id) if cfg.hf_repo_type_prefix else cfg.hf_dataset_id - ) - return _strip_hf_protocol(source_path) - - -def _assert_bucket_support_available(source_path: str) -> None: - if not source_path.startswith(HF_BUCKET_PATH_PREFIX): - return - - if Version(huggingface_hub.__version__) < Version("1.6.0"): - raise RuntimeError( - f"Bucket paths require huggingface_hub>=1.6.0, found {huggingface_hub.__version__}. " - "Upgrade the runtime environment to a buckets-capable huggingface_hub version." 
- ) - - -def _relative_path_in_source(file_path: str, source_path: str) -> str: - normalized_file = _strip_hf_protocol(file_path) - normalized_source = _strip_hf_protocol(source_path).rstrip("/") - - source_prefix = f"{normalized_source}/" - if normalized_file.startswith(source_prefix): - return normalized_file.removeprefix(source_prefix) - - source_parts = [segment for segment in normalized_source.split("/") if segment] - file_parts = [segment for segment in normalized_file.split("/") if segment] - - if len(file_parts) >= len(source_parts): - matches_source = True - for source_segment, file_segment in zip(source_parts, file_parts, strict=False): - if source_segment == file_segment: - continue - if file_segment.split("@", 1)[0] == source_segment: - continue - matches_source = False - break - - if matches_source: - return "/".join(file_parts[len(source_parts) :]) - - # Backwards-compatible fallback for historical dataset path layout. - return normalized_file.split("/", 3)[-1] - - -def ensure_fsspec_path_writable(output_path: str) -> None: - """Check if the fsspec path is writable by trying to create and delete a temporary file.""" - fs, _ = url_to_fs(output_path) - try: - fs.mkdirs(output_path, exist_ok=True) - test_path = os.path.join(output_path, "test_write_access") - with fs.open(test_path, "w") as f: - f.write("test") - fs.rm(test_path) - except Exception as e: - raise ValueError(f"No write access to fsspec path: {output_path} ({e})") from e - - -def stream_file_to_fsspec( - gcs_output_path: str, - file_path: str, - fsspec_file_path: str, - expected_size: int | None = None, - read_timeout_seconds: float = 120.0, - progress_log_interval_seconds: float = 60.0, - read_chunk_size_mib: int = 8, -): - """Stream a file from HfFileSystem to another fsspec path using atomic write. - - Uses atomic_rename to write to a temp file first, then rename on success. - This enables recovery across individual files if the job is interrupted. - - Args: - gcs_output_path: Base output path for the download. - file_path: Source file path on HuggingFace. - fsspec_file_path: Target file path on the destination filesystem. - expected_size: Expected file size in bytes for validation. If provided, - the download will fail if the downloaded size doesn't match. 
- """ - hf_fs = HfFileSystem(token=os.environ.get("HF_TOKEN", False)) - target_fs, _ = url_to_fs(gcs_output_path) - chunk_size = max(1, int(read_chunk_size_mib)) * 1024 * 1024 - max_retries = 20 - # 15 minutes max sleep - max_sleep = 15 * 60 - # Minimum base wait time to avoid too-fast retries - min_base_wait = 5 - - # Retry when there is an error, such as hf rate limit - last_exception = None - for attempt in range(max_retries): - try: - target_fs.mkdirs(os.path.dirname(fsspec_file_path), exist_ok=True) - bytes_written = 0 - with atomic_rename(fsspec_file_path) as temp_path: - previous_socket_timeout = socket.getdefaulttimeout() - socket.setdefaulttimeout(read_timeout_seconds) - try: - with ( - hf_fs.open(file_path, "rb", block_size=chunk_size) as src_file, - open_url(temp_path, "wb") as dest_file, - ): - start_time = time.monotonic() - next_progress_log = start_time + progress_log_interval_seconds - while True: - try: - chunk = src_file.read(chunk_size) - except TimeoutError as timeout_error: - raise TimeoutError( - f"Timed out reading from {file_path} after " - f"{read_timeout_seconds:.1f}s with {bytes_written} bytes written" - ) from timeout_error - if not chunk: - break - dest_file.write(chunk) - bytes_written += len(chunk) - now = time.monotonic() - if progress_log_interval_seconds > 0 and now >= next_progress_log: - elapsed = max(now - start_time, 1e-9) - speed_mib_s = (bytes_written / (1024**2)) / elapsed - logger.info( - f"Streaming {file_path}: {bytes_written / (1024**2):.1f} MiB written " - f"in {elapsed:.1f}s ({speed_mib_s:.2f} MiB/s)" - ) - next_progress_log = now + progress_log_interval_seconds - finally: - socket.setdefaulttimeout(previous_socket_timeout) - - # Validate file size BEFORE atomic_rename commits the file - if expected_size is not None and bytes_written != expected_size: - raise ValueError( - f"Size mismatch for {file_path}: expected {expected_size} bytes, got {bytes_written} bytes" - ) - - logger.info(f"Streamed {file_path} successfully to {fsspec_file_path} ({bytes_written} bytes)") - return {"file_path": file_path, "status": "success", "size": bytes_written} - except Exception as e: - last_exception = e - # Base wait: min 5s, then exponential: 5, 10, 20, 40, 80, 160, 320, 600 (capped) - wait_base = max(min_base_wait, min_base_wait * (2**attempt)) - - error_type = type(e).__name__ - error_msg = str(e) - status_code = -1 - - if isinstance(e, HfHubHTTPError): - status_code = e.response.status_code - TOO_MANY_REQUESTS = 429 - if status_code == TOO_MANY_REQUESTS: - # NOTE: RateLimit "api\|pages\|resolvers";r=[remaining];t=[seconds remaining until reset] - try: - rate_limit_wait = int(e.response.headers["RateLimit"].split(";")[-1].split("=")[-1]) - wait_base = max(wait_base, rate_limit_wait + 10) # Add buffer to rate limit wait - except Exception: - logger.warning("Failed to parse rate limit header, using default wait period") - - logger.warning( - f"Attempt {attempt + 1}/{max_retries} failed for {file_path}: " - f"{error_type} (status={status_code}): {error_msg}" - ) - - jitter = random.uniform(0, min(wait_base * 0.25, 30)) # Up to 25% jitter, max 30s - wait_time = min(wait_base + jitter, max_sleep) - - logger.info(f"Retrying {file_path} in {wait_time:.1f}s...") - time.sleep(wait_time) - - raise RuntimeError( - f"Failed to download {file_path} after {max_retries} attempts. 
" - f"Last error: {type(last_exception).__name__}: {last_exception}" - ) - - -def download_hf(cfg: DownloadConfig) -> None: - - configure_logging(level=logging.INFO) - - # Set cfg.append_sha_to_path=True to mimic the older behavior of writing to gcs_output_path/. - # Some historical datasets were written that way, so this flag keeps backwards compatibility when needed. - - # Ensure the output path is writable - try: - output_path = os.path.join(cfg.gcs_output_path, cfg.revision) if cfg.append_sha_to_path else cfg.gcs_output_path - ensure_fsspec_path_writable(output_path) - except ValueError as e: - logger.exception(f"Output path validation failed: {e}") - raise e - - # Initialize Hugging Face filesystem - logger.info("Identifying files to download from HuggingFace...") - hf_fs = HfFileSystem(token=os.environ.get("HF_TOKEN", False)) - hf_source_path = _resolve_hf_source_path(cfg) - _assert_bucket_support_available(hf_source_path) - - if not cfg.hf_urls_glob: - # We get all the files using find - files = hf_fs.find(hf_source_path, revision=cfg.revision) - else: - # Get list of files directly from HfFileSystem matching the pattern - files = [] - for hf_url_glob in cfg.hf_urls_glob: - pattern = os.path.join(hf_source_path, hf_url_glob) - files += hf_fs.glob(pattern, revision=cfg.revision) - - if not files: - raise ValueError(f"No files found for dataset `{cfg.hf_dataset_id}. Used glob patterns: {cfg.hf_urls_glob}") - - # Get file sizes for validation - logger.info("Getting file sizes for validation...") - file_sizes: dict[str, int | None] = {} - for file in files: - try: - info = hf_fs.info(file, revision=cfg.revision) - file_sizes[file] = info.get("size") or None - except Exception as e: - logger.warning(f"Could not get size for {file}: {e}") - file_sizes[file] = None # Will skip validation for this file - - download_tasks = [] - - for file in files: - try: - relative_file_path = _relative_path_in_source(file, hf_source_path) - if relative_file_path.startswith(".."): - raise ValueError(f"Computed path escapes source root: source={hf_source_path}, file={file}") - fsspec_file_path = os.path.join(output_path, relative_file_path) - expected_size = file_sizes.get(file) - download_tasks.append( - ( - output_path, - file, - fsspec_file_path, - expected_size, - cfg.read_timeout_seconds, - cfg.progress_log_interval_seconds, - cfg.read_chunk_size_mib, - ) - ) - except Exception as e: - logging.exception(f"Error preparing task for {file}: {e}") - - total_files = len(download_tasks) - total_size_gb = sum(s for s in file_sizes.values() if s is not None) / (1024**3) - logger.info(f"Total number of files to process: {total_files} ({total_size_gb:.2f} GB)") - - pipeline = ( - Dataset.from_list(download_tasks) - .map(lambda task: stream_file_to_fsspec(*task)) - .write_jsonl( - f"{cfg.gcs_output_path}/.metrics/success-part-{{shard:05d}}-of-{{total:05d}}.jsonl", skip_existing=True - ) - ) - ctx = ZephyrContext(name="download-hf", max_workers=cfg.zephyr_max_parallelism) - ctx.execute(pipeline) - - # Write Provenance JSON - write_provenance_json( - output_path, - metadata={"dataset": cfg.hf_dataset_id, "version": cfg.revision, "links": files}, - ) - - logger.info(f"Streamed all files and wrote provenance JSON; check {output_path}.") - - -@draccus.wrap() -def main(cfg: DownloadConfig) -> None: - """Download HuggingFace dataset.""" - download_hf(cfg) - +from marin.datakit.download.huggingface import DownloadConfig as DownloadConfig +from marin.datakit.download.huggingface import download_hf as download_hf +from 
marin.datakit.download.huggingface import ensure_fsspec_path_writable as ensure_fsspec_path_writable +from marin.datakit.download.huggingface import main as main +from marin.datakit.download.huggingface import stream_file_to_fsspec as stream_file_to_fsspec if __name__ == "__main__": main() diff --git a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py b/lib/marin/src/marin/download/huggingface/stream_remove_columns.py index b16e3a1f1b..6d5d39f492 100644 --- a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py +++ b/lib/marin/src/marin/download/huggingface/stream_remove_columns.py @@ -1,101 +1,9 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.stream_remove_columns -"""Remove unnecessary columns while streaming data from huggingface.""" - -import logging -import os -from dataclasses import dataclass - -import pandas as pd -import pyarrow.parquet as pq -from huggingface_hub import HfFileSystem -from tqdm import tqdm -from zephyr import Dataset, ZephyrContext - -hf_fs = HfFileSystem() -logger = logging.getLogger(__name__) - - -def prune_stream_and_save(input_file: str, output_file: str, keep_columns: list[str]): - """ - Prunes and saves a parquet file by removing un-specified columns. - - Reads the input parquet file in batches, removes columns not in keep_columns, - and writes the result to output_file. Processing in batches avoids memory issues. - - Args: - input_file (str): Path to input parquet file on HuggingFace - output_file (str): Path where pruned parquet file will be saved - keep_columns (list[str]): List of column names to retain - """ - parquet_file = pq.ParquetFile(hf_fs.open(input_file)) - - full_df_list = [] - for batch in tqdm(parquet_file.iter_batches(batch_size=10000), desc=f"Processing {input_file}"): - df = batch.to_pandas() - - drop_columns = [col for col in df.columns if col not in keep_columns] - df = df.drop(columns=drop_columns) - - full_df_list.append(df) - - full_df = pd.concat(full_df_list) - logger.info(f"Saving pruned dataset of shape {full_df.shape} to {output_file}") - full_df.to_parquet(output_file, index=False) - - -def get_file_tasks(hf_path: str, output_path: str, keep_columns: list[str]): - """ - Generate file processing tasks for a HuggingFace subset. 
- - Args: - hf_path (str): The HuggingFace dataset path to load - output_path (str): The output path to save the pruned dataset - keep_columns (list[str]): The columns to keep in the pruned dataset - - Yields: - Dict with input_file, output_file, and keep_columns for each parquet file - """ - logger.info(f"Loading dataset from {hf_path}") - parquet_list = hf_fs.glob(f"{hf_path}/*.parquet") - - for file in parquet_list: - output_file = os.path.join(output_path, os.path.basename(file)) - yield {"input_file": file, "output_file": output_file, "keep_columns": keep_columns} - - -@dataclass -class DatasetConfig: - hf_repo_id: str - hf_revision: str - hf_paths: list[str] - output_path: str - keep_columns: list[str] - - -def prune_hf_dataset(cfg: DatasetConfig): - logger.info(f"Starting dataset pruning for {cfg.hf_paths}") - - # Build list of subset paths to process - subset_tasks = [] - for path in cfg.hf_paths: - # HF Path form: hf://[][@]/ - hf_path = f"hf://datasets/{cfg.hf_repo_id}@{cfg.hf_revision}/{path}" - logger.info(f"Processing subset {hf_path}") - output_path = os.path.join(cfg.output_path, path) - subset_tasks.append({"hf_path": hf_path, "output_path": output_path}) - - # Build pipeline with nested parallelism: - # - Outer level: process subsets (MAX_CONCURRENT_WORKERS=1) - # - Inner level: process files within each subset - pipeline = ( - Dataset.from_list(subset_tasks) - .flat_map(lambda task: get_file_tasks(task["hf_path"], task["output_path"], cfg.keep_columns)) - .map(lambda task: prune_stream_and_save(task["input_file"], task["output_file"], cfg.keep_columns)) - ) - - logger.info("Executing pipeline") - ctx = ZephyrContext(name="hf-remove-columns") - ctx.execute(pipeline) - logger.info("Successfully processed all subsets") +from marin.datakit.download.stream_remove_columns import DatasetConfig as DatasetConfig +from marin.datakit.download.stream_remove_columns import get_file_tasks as get_file_tasks +from marin.datakit.download.stream_remove_columns import hf_fs as hf_fs +from marin.datakit.download.stream_remove_columns import prune_hf_dataset as prune_hf_dataset +from marin.datakit.download.stream_remove_columns import prune_stream_and_save as prune_stream_and_save diff --git a/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py b/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py index 1aa580c618..43c368f5b9 100644 --- a/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py +++ b/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py @@ -1,364 +1,10 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.upload_gcs_to_hf -""" -Upload GCS to Hugging Face (HF) Script - -This script transfers model checkpoints or other content from Google Cloud Storage (GCS) -to Hugging Face repositories. 
It handles: -- Finding checkpoint directories in GCS buckets -- Downloading the content locally (to a temporary directory) -- Uploading to a specified Hugging Face repository with appropriate versioning -- Supporting dry-run mode to preview what would be uploaded - -Usage as a script: - python upload_gcs_to_hf.py --repo-id="organization/model-name" [--dry-run] [--directory="gs://bucket/path"] - -Usage as an ExecutorStep: - upload_step = ExecutorStep( - name="upload_model_to_hf", - fn=upload_gcs_to_hf, - config=UploadConfig( - hf_repo_id="organization/model-name", - gcs_directories=["gs://bucket/path/to/model"], - dry_run=False - ) - ) -""" - -import argparse -import logging -import os -import re -import subprocess -import tempfile -from dataclasses import dataclass, field - -from google.cloud import storage -from google.cloud.storage import transfer_manager -from huggingface_hub import HfApi, create_repo -from iris.logging import configure_logging - -# Set up logging -logger = logging.getLogger(__name__) - - -@dataclass -class UploadConfig: - """Configuration for uploading from GCS to Hugging Face.""" - - hf_repo_id: str - gcs_directories: list[str] = field(default_factory=list) - dry_run: bool = False - wait_for_completion: bool = True # Added for compatibility with other configs - - -# Default GCS directories to check if none specified -DEFAULT_GCS_DIRS = [ - "gs://marin-eu-west4/checkpoints/llama-8b-tootsie-0.001-19ad63/hf/", - "gs://marin-us-central2/checkpoints/llama-8b-tootsie-phase2/hf/", - "gs://marin-us-central2/checkpoints/llama-8b-tootsie-phase3/hf/", - "gs://marin-us-central2/checkpoints/tootsie-8b-soft-raccoon-3/hf/", - "gs://marin-us-central2/checkpoints/llama-8b-tootsie-adept-phoenix/hf/", - "gs://marin-us-central2/checkpoints/tootsie-8b-sensible-starling/hf/", - "gs://marin-us-central1/checkpoints/tootsie-8b-deeper-starling/hf/", -] - - -def list_gcs_directories(gcs_path: str) -> list[tuple[str, int]]: - """List subdirectories by examining full blob paths.""" - if not gcs_path.startswith("gs://"): - raise ValueError(f"Invalid GCS path: {gcs_path}") - - path = gcs_path[5:] # Remove "gs://" - bucket_name = path.split("/")[0] - prefix = "/".join(path.split("/")[1:]) - - logger.info(f"Checking: {gcs_path}") - - # Get the bucket - client = storage.Client() - bucket = client.bucket(bucket_name) - - # List blobs with this prefix (without delimiter to get all) - blobs = bucket.list_blobs(prefix=prefix) - - # Extract potential directories from blob paths - directories = set() - step_pattern = re.compile(r"step-\d+") - - for blob in blobs: - # Remove the prefix to get the relative path - relative_path = blob.name[len(prefix) :] - - # Skip if there's no relative path - if not relative_path: - continue - - # Extract the first directory level - parts = relative_path.strip("/").split("/") - if parts: - first_dir = parts[0] - - # Check if it's a step directory - if step_pattern.match(first_dir): - directories.add(first_dir) - - # Process the directories we found - step_dirs_local = [] - for dir_name in directories: - if step_pattern.match(dir_name): - try: - step_number = int(dir_name.split("-")[1]) - full_path = f"{gcs_path}{dir_name}/" - step_dirs_local.append((full_path, step_number)) - logger.info(f"Found step directory: {full_path} with step {step_number}") - except (IndexError, ValueError) as e: - logger.error(f"Error parsing step number from {dir_name}: {e}") - - logger.info(f"Found {len(step_dirs_local)} step directories in {gcs_path}") - return step_dirs_local - - -def 
download_from_gcs(gcs_path: str, local_path: str) -> bool: - """Download contents from a GCS path to a local directory using the GCS transfer manager.""" - logger.info(f"Downloading {gcs_path} to {local_path}...") - - # Parse the GCS path (format: gs://bucket-name/path/to/files) - if not gcs_path.startswith("gs://"): - logger.error(f"Invalid GCS path format: {gcs_path}") - return False - - bucket_name = gcs_path[5:].split("/")[0] - prefix = "/".join(gcs_path[5:].split("/")[1:]) - - # Handle wildcard at the end (the original had f"{gcs_path}*") - if prefix.endswith("*"): - prefix = prefix[:-1] - - # Initialize the GCS client - client = storage.Client() - bucket = client.bucket(bucket_name) - - # List all matching blobs - blobs = list(bucket.list_blobs(prefix=prefix)) - - if not blobs: - logger.error(f"No files found in {gcs_path}") - return False - - total_files = len(blobs) - logger.info(f"Found {total_files} files to download from {gcs_path}") - - # Get the blob names to download (excluding directory placeholders) - blob_names = [] - for blob in blobs: - if not blob.name.endswith("/"): - blob_names.append(blob.name) - - if len(blob_names) < total_files: - logger.info(f"Filtered out {total_files - len(blob_names)} directory markers") - - # Ensure local directory exists - os.makedirs(local_path, exist_ok=True) - - # Log the first few blob names to debug issues - if blob_names: - logger.info(f"Sample blob names (first 3): {', '.join(blob_names[:3])}") - - # Use transfer manager to download all blobs in parallel - logger.info(f"Starting parallel download of {len(blob_names)} files...") - - transfer_manager.download_many_to_path( - bucket=bucket, - blob_names=blob_names, - destination_directory=local_path, - max_workers=8, - create_directories=True, - worker_type="process", - raise_exception=True, - ) - - logger.info(f"Download completed successfully. 
Downloaded {len(blob_names)} files.") - return True - - -def checkpoint_exists(repo_id: str, step: int, version_name: str) -> bool: - """Check if a specific revision exists in a Hugging Face repository.""" - try: - api = HfApi() - commits = api.list_repo_commits(repo_id=repo_id) - for commit in commits: - if f"step {step}" in commit.title: - return True - return False - except Exception: - return False - - -def extract_version_from_path(gcs_path: str) -> str: - """Extract the version name from a GCS path.""" - # Extract model name from path like "gs://marin-eu-west4/checkpoints/llama-8b-tootsie-0.001-19ad63/hf/" - parts = gcs_path.strip("/").split("/") - return parts[-3] - - -def upload_to_huggingface(local_path: str, repo_id: str, step: int, version_name: str) -> bool: - """Upload a local directory to Hugging Face as a specific revision.""" - logger.info(f"Uploading checkpoint {version_name}, step {step} to Hugging Face") - - # Check if repo exists, create if not - api = HfApi() - create_repo(repo_id=repo_id, exist_ok=True) - # Upload the directory - result = api.upload_folder( - folder_path=local_path, - repo_id=repo_id, - commit_message=f"Upload checkpoint for step {step} ({version_name})", - ) - try: - api.delete_tag(repo_id=repo_id, tag=version_name) - except Exception: - logger.info("Creating tag for the first time") - api.create_tag(repo_id=repo_id, tag=version_name) - logger.info("Upload completed successfully.") - logger.info(f"Commit URL: {result.commit_url}") - return True - - -def upload_gcs_to_hf(cfg: UploadConfig) -> None: - """Main function to upload model checkpoints from GCS to Hugging Face.""" - - configure_logging(level=logging.INFO) - - # Collect all step directories - all_step_dirs = [] - - # Determine which directories to process - directories_to_process = cfg.gcs_directories if cfg.gcs_directories else DEFAULT_GCS_DIRS - - # Process each directory - for directory in directories_to_process: - try: - step_dirs = list_gcs_directories(directory) - all_step_dirs.extend(step_dirs) - except Exception as e: - logger.error(f"Error listing {directory}: {e}") - - # Sort all step directories by step number - if all_step_dirs: - all_step_dirs.sort(key=lambda x: x[1]) - - # Print sorted step directories - logger.info("\nAll step directories sorted by step number:") - logger.info("-" * 50) - for full_path, _step_number in all_step_dirs: - logger.info(f"- {full_path}") - - logger.info(f"\nTotal: {len(all_step_dirs)} step directories") - - # Upload to Hugging Face - if not cfg.dry_run: - logger.info(f"\nUploading to Hugging Face repo: {cfg.hf_repo_id}") - - for full_path, step_number in all_step_dirs: - # Extract version name from the path - version_name = extract_version_from_path(full_path) - - # Check if this checkpoint already exists - if checkpoint_exists(cfg.hf_repo_id, step_number, version_name): - logger.info( - f"Step {step_number} for {version_name} already exists in HF repo {cfg.hf_repo_id}, skipping" - ) - continue - - # Create a temporary directory for downloading - with tempfile.TemporaryDirectory() as temp_dir: - logger.info(f"\nProcessing step {step_number} from {full_path} ({version_name})") - - # Download from GCS - if download_from_gcs(full_path, temp_dir): - # Upload to HF - if upload_to_huggingface(temp_dir, cfg.hf_repo_id, step_number, version_name): - logger.info( - f"Successfully uploaded step {step_number} ({version_name}) to HF repo {cfg.hf_repo_id}" - ) - else: - logger.error(f"Failed to upload step {step_number}") - else: - logger.error(f"Failed to 
download step {step_number}") - - logger.info("\nUpload process completed.") - else: - logger.info("\nDry run - showing what would be uploaded:") - logger.info("-" * 50) - - for i, (full_path, step_number) in enumerate(all_step_dirs): - version_name = extract_version_from_path(full_path) - logger.info(f"\nCheckpoint {i + 1}/{len(all_step_dirs)}:") - logger.info(f" Source: {full_path}") - logger.info(f" Target repo: {cfg.hf_repo_id}") - logger.info(f" Revision: {version_name}") - logger.info(f" Commit message: Upload checkpoint for step {step_number} ({version_name})") - - # Try to estimate what files would be uploaded - try: - # Use gsutil to list files in the directory - cmd = ["gsutil", "ls", f"{full_path}"] - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode == 0: - files = result.stdout.strip().split("\n") - # Filter out empty strings and limit to 5 for display - files = [f for f in files if f] - - if files: - logger.info( - f" Example files that would be uploaded ({min(len(files), 5)} of {len(files)}):" - ) - for file in files[:5]: - logger.info(f" - {os.path.basename(file)}") - if len(files) > 5: - logger.info(f" - ... and {len(files) - 5} more") - except Exception as e: - logger.error(f" Could not list files: {e}") - - logger.info("\nDry run completed - no actual uploads performed.") - else: - logger.warning("\nNo step directories found in any of the paths.") - logger.warning("You might want to check if:") - logger.warning("1. The paths are correct") - logger.warning("2. You have permissions to access these buckets") - logger.warning("3. There are step directories in these locations") - - -def main(): - """Command line entry point for direct script usage.""" - parser = argparse.ArgumentParser(description="Upload checkpoints from GCS to Hugging Face") - parser.add_argument( - "--repo-id", required=True, help='Target Hugging Face repository ID (e.g., "username/model-name")' - ) - parser.add_argument("--dry-run", action="store_true", help="Only list checkpoints without uploading") - parser.add_argument( - "--directories", - nargs="+", - help="Process specific GCS directories instead of the built-in list. 
Multiple directories can be provided.", - ) - args = parser.parse_args() - - # Create config from args - config = UploadConfig( - hf_repo_id=args.repo_id, gcs_directories=args.directories if args.directories else [], dry_run=args.dry_run - ) - - # Check if application default credentials are set - if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ: - logger.warning("Warning: GOOGLE_APPLICATION_CREDENTIALS environment variable not set.") - logger.warning("Make sure you're authenticated with Google Cloud before running this script.") - logger.warning("You can authenticate using: gcloud auth application-default login") - - # Run the upload function - upload_gcs_to_hf(config) - +from marin.datakit.download.upload_gcs_to_hf import UploadConfig as UploadConfig +from marin.datakit.download.upload_gcs_to_hf import main as main +from marin.datakit.download.upload_gcs_to_hf import upload_gcs_to_hf as upload_gcs_to_hf if __name__ == "__main__": main() diff --git a/lib/marin/src/marin/download/nemotron_cc/__init__.py b/lib/marin/src/marin/download/nemotron_cc/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/nemotron_cc/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py b/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py index 77c9d82cf5..81251cb66c 100644 --- a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py +++ b/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py @@ -1,119 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.nemotron_cc -""" -Download and process Nemotron-CC dataset from Common Crawl. 
- -Example Usage: -uv run zephyr --backend=ray --max-parallelism=100 --memory=4GB \ - lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py \ - --output_path gs://bucket/nemotron-output -""" - -import json -import logging -import os -from collections.abc import Iterator -from dataclasses import dataclass - -import requests -import zstandard -from iris.marin_fs import open_url -from marin.execution import THIS_OUTPUT_PATH -from marin.utils import fsspec_exists -from requests.adapters import HTTPAdapter -from urllib3.util import Retry -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename - -logger = logging.getLogger(__name__) - -myagent = "marin-nemotron-ingress/1.0" -NCC_PATH_FILE_URL = "https://data.commoncrawl.org/contrib/Nemotron/Nemotron-CC/data-jsonl.paths.gz" - - -def _iter_jsonl_from_zstd_stream(raw_stream) -> Iterator[dict]: - """Yield parsed JSON objects from a zstd-compressed JSONL stream.""" - dctx = zstandard.ZstdDecompressor() - with dctx.stream_reader(raw_stream) as reader: - buf = bytearray() - while True: - chunk = reader.read(1048576) - if not chunk: - break - buf.extend(chunk) - while True: - newline_pos = buf.find(b"\n") - if newline_pos < 0: - break - line_bytes = bytes(buf[:newline_pos]) - del buf[: newline_pos + 1] - if not line_bytes.strip(): - continue - yield json.loads(line_bytes) - - -def download_single_nemotron_path(input_file_path: str, output_file_path: str) -> dict: - """Fetches content from a Common Crawl path, streaming records to zstd output.""" - cc_url = f"https://data.commoncrawl.org/{input_file_path}" - logger.info(f"Downloading Nemotron CC file {cc_url} to {output_file_path}") - - session = requests.Session() - retries = Retry(total=5, backoff_factor=1.0, status_forcelist=[500, 502, 503, 504], allowed_methods=["GET"]) - adapter = HTTPAdapter(max_retries=retries) - session.mount("https://", adapter) - session.mount("http://", adapter) - - response = session.get(cc_url, headers={"user-agent": myagent}, stream=True) - response.raise_for_status() - - num_records = 0 - with atomic_rename(output_file_path) as temp_path: - with open_url(temp_path, "w", compression="zstd") as out: - for record in _iter_jsonl_from_zstd_stream(response.raw): - dolma_record = { - "id": record["warc_record_id"], - "text": record["text"], - "source": "nemotron", - "format": "text", - "metadata": {f"nemotron_{k}": v for k, v in record.items() if k not in ("warc_record_id", "text")}, - } - print(json.dumps(dolma_record), file=out) - num_records += 1 - - return {"input_file": input_file_path, "output_file": output_file_path, "num_records": num_records} - - -@dataclass -class NemotronIngressConfig: - output_path: str = THIS_OUTPUT_PATH - - -def download_nemotron_cc(cfg: NemotronIngressConfig): - paths_file_path = os.path.join(cfg.output_path, "data-jsonl.paths") - logger.info(f"Downloading Nemotron CC path file {paths_file_path}") - - with open_url(NCC_PATH_FILE_URL, "rb") as f, open_url(paths_file_path, "wb") as f_out: - f_out.write(f.read()) - - logger.info(f"Reading paths from {paths_file_path}") - all_files = [] - with open_url(paths_file_path, "r", compression="gzip") as f: - for line in f: - file = line.strip() - output_file_path = os.path.join(cfg.output_path, file).replace("jsonl.zstd", "jsonl.zst") - all_files.append((file, output_file_path)) - - logger.info(f"Processing {len(all_files)} Nemotron CC files") - - pipeline = ( - Dataset.from_list(all_files) - .filter(lambda file_info: not fsspec_exists(file_info[1])) - .map(lambda 
file_info: download_single_nemotron_path(*file_info)) - .write_jsonl(os.path.join(cfg.output_path, ".metrics/download-{shard:05d}.jsonl"), skip_existing=True) - ) - - ctx = ZephyrContext(name="download-nemotron-cc") - ctx.execute(pipeline) - - logger.info(f"Downloaded Nemotron CC files to {cfg.output_path}") +from marin.datakit.download.nemotron_cc import NemotronIngressConfig as NemotronIngressConfig +from marin.datakit.download.nemotron_cc import download_nemotron_cc as download_nemotron_cc +from marin.datakit.download.nemotron_cc import download_single_nemotron_path as download_single_nemotron_path diff --git a/lib/marin/src/marin/download/uncheatable_eval/__init__.py b/lib/marin/src/marin/download/uncheatable_eval/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/uncheatable_eval/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/uncheatable_eval/download.py b/lib/marin/src/marin/download/uncheatable_eval/download.py index b77195ed63..9baf9db8ad 100644 --- a/lib/marin/src/marin/download/uncheatable_eval/download.py +++ b/lib/marin/src/marin/download/uncheatable_eval/download.py @@ -1,394 +1,12 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.uncheatable_eval -"""Download and normalize the latest Uncheatable Eval data dumps.""" - -from __future__ import annotations - -import json -import logging -import os -import posixpath -import re -from collections.abc import Iterable -from dataclasses import dataclass -from typing import Any - -import requests -from iris.marin_fs import open_url -from marin.execution import THIS_OUTPUT_PATH, ExecutorStep, VersionedValue, ensure_versioned, this_output_path -from marin.utils import fsspec_mkdirs -from requests.adapters import HTTPAdapter -from urllib3.util import Retry -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename - -logger = logging.getLogger(__name__) - -FILENAME_PATTERN = re.compile(r"^(?P.+)_(?P\d{8})to(?P\d{8})(?P(?:\.[^.]+)*)$") - -TEXT_FIELD_CANDIDATES: tuple[str, ...] = ( - "text", - "body", - "content", - "article", - "document", - "raw_text", - "code", - "message", - "description", - "story", -) - -LIST_FIELD_CANDIDATES: tuple[str, ...] = ( - "paragraphs", - "sentences", - "lines", - "messages", +from marin.datakit.download.uncheatable_eval import UncheatableEvalDataset as UncheatableEvalDataset +from marin.datakit.download.uncheatable_eval import ( + UncheatableEvalDownloadConfig as UncheatableEvalDownloadConfig, ) - -ID_FIELD_CANDIDATES: tuple[str, ...] 
= ( - "id", - "uuid", - "guid", - "doc_id", - "document_id", - "article_id", - "hash", - "sha", - "uid", +from marin.datakit.download.uncheatable_eval import ( + download_latest_uncheatable_eval as download_latest_uncheatable_eval, ) - - -@dataclass(frozen=True) -class UncheatableEvalDataset: - """Information about a single data dump file from the Uncheatable Eval repository.""" - - benchmark: str - start_date: str - end_date: str - name: str - download_url: str - sha: str | None = None - size: int | None = None - - @property - def date_range(self) -> str: - return f"{self.start_date}to{self.end_date}" - - @property - def source_label(self) -> str: - return f"{self.benchmark}:{self.date_range}" - - def output_filename(self, suffix: str = ".jsonl.gz") -> str: - return f"{self.benchmark}_{self.date_range}{suffix}" - - -@dataclass -class UncheatableEvalDownloadConfig: - """Configuration for downloading and normalizing Uncheatable Eval dumps.""" - - output_path: str | VersionedValue[str] = THIS_OUTPUT_PATH - repo_owner: str | VersionedValue[str] = "Jellyfish042" - repo_name: str | VersionedValue[str] = "uncheatable_eval" - data_path: str | VersionedValue[str] = "data" - branch: str | VersionedValue[str] = "master" - max_concurrent_downloads: int = 8 - request_timeout: int = 120 - github_token: str | None = None - skip_existing: bool = True - metadata_filename: str = "metadata.json" - - -def _http_headers(cfg: UncheatableEvalDownloadConfig) -> dict[str, str]: - headers = {"Accept": "application/vnd.github+json"} - token = cfg.github_token or os.environ.get("GITHUB_TOKEN") - if token: - headers["Authorization"] = f"Bearer {token}" - return headers - - -def _fetch_directory_listing(cfg: UncheatableEvalDownloadConfig) -> list[dict[str, Any]]: - """Return the list of files in the configured GitHub repository directory.""" - - headers = _http_headers(cfg) - base_url = f"https://api.github.com/repos/{cfg.repo_owner!s}/{cfg.repo_name!s}/contents/{cfg.data_path!s}" - params = {"ref": str(cfg.branch)} - response = requests.get(base_url, headers=headers, params=params, timeout=cfg.request_timeout) - response.raise_for_status() - payload = response.json() - if not isinstance(payload, list): - raise ValueError(f"Unexpected response from GitHub API: {payload!r}") - return payload - - -def _parse_available_dumps(entries: Iterable[dict[str, Any]]) -> list[UncheatableEvalDataset]: - """Parse GitHub directory entries into dataset metadata.""" - - datasets: list[UncheatableEvalDataset] = [] - for entry in entries: - name = entry.get("name") - if not isinstance(name, str): - continue - match = FILENAME_PATTERN.match(name) - if not match: - continue - benchmark = match.group("benchmark") - start = match.group("start") - end = match.group("end") - download_url = entry.get("download_url") - if not isinstance(download_url, str): - logger.debug("Skipping %s because it has no download_url", name) - continue - datasets.append( - UncheatableEvalDataset( - benchmark=benchmark, - start_date=start, - end_date=end, - name=name, - download_url=download_url, - sha=entry.get("sha"), - size=entry.get("size"), - ) - ) - return datasets - - -def _select_latest_dumps(datasets: Iterable[UncheatableEvalDataset]) -> list[UncheatableEvalDataset]: - """Select the latest dump for each benchmark based on the end date (and start date as tie breaker).""" - - latest: dict[str, UncheatableEvalDataset] = {} - for dataset in datasets: - existing = latest.get(dataset.benchmark) - if existing is None: - latest[dataset.benchmark] = dataset - 
continue - candidate_key = (dataset.end_date, dataset.start_date, dataset.name) - existing_key = (existing.end_date, existing.start_date, existing.name) - if candidate_key > existing_key: - latest[dataset.benchmark] = dataset - return sorted(latest.values(), key=lambda d: d.benchmark) - - -def _extract_id(raw: Any, dataset: UncheatableEvalDataset, index: int) -> str: - if isinstance(raw, dict): - for key in ID_FIELD_CANDIDATES: - value = raw.get(key) - if value: - return str(value) - metadata = raw.get("metadata") - if isinstance(metadata, dict): - for key in ID_FIELD_CANDIDATES: - value = metadata.get(key) - if value: - return str(value) - return f"{dataset.benchmark}_{dataset.date_range}_{index:06d}" - - -def _join_list_field(value: Any) -> str | None: - if isinstance(value, list): - text_items = [str(item) for item in value if item is not None] - if text_items: - return "\n".join(text_items) - return None - - -def _extract_text(raw: Any) -> str | None: - if raw is None: - return None - if isinstance(raw, str): - return raw - if isinstance(raw, dict): - for key in TEXT_FIELD_CANDIDATES: - value = raw.get(key) - if isinstance(value, str) and value.strip(): - return value - for key in TEXT_FIELD_CANDIDATES: - value = raw.get(key) - joined = _join_list_field(value) - if joined: - return joined - for key in LIST_FIELD_CANDIDATES: - joined = _join_list_field(raw.get(key)) - if joined: - return joined - title = raw.get("title") - body = raw.get("body") - if isinstance(title, str) and isinstance(body, str): - combined = f"{title.strip()}\n\n{body.strip()}" - if combined.strip(): - return combined - if isinstance(title, str) and title.strip(): - return title - return json.dumps(raw, ensure_ascii=False) - return str(raw) - - -def _normalize_record(raw: Any, dataset: UncheatableEvalDataset, index: int) -> dict[str, str]: - text = _extract_text(raw) - if text is None or not str(text).strip(): - raise ValueError(f"Record {index} in {dataset.name} does not contain text") - record_id = _extract_id(raw, dataset, index) - return {"id": record_id, "text": text, "source": dataset.source_label} - - -def _download_and_convert_single( - task: DownloadTask, -) -> dict[str, Any]: - session = requests.Session() - retries = Retry(total=5, backoff_factor=1.0, status_forcelist=[500, 502, 503, 504], allowed_methods=["GET"]) - adapter = HTTPAdapter(max_retries=retries) - session.mount("https://", adapter) - session.mount("http://", adapter) - - logger.info("Downloading %s from %s", task.dataset.name, task.download_url) - response = session.get(task.download_url, timeout=task.cfg.request_timeout, headers=_http_headers(task.cfg)) - response.raise_for_status() - - try: - payload = response.json() - except ValueError as exc: - raise ValueError(f"Failed to decode JSON payload for {task.dataset.name}") from exc - - if not isinstance(payload, list): - raise ValueError(f"Expected list in dataset {task.dataset.name}, found {type(payload).__name__}") - - fsspec_mkdirs(os.path.dirname(task.output_file_path), exist_ok=True) - - record_count = 0 - with atomic_rename(task.output_file_path) as temp_path: - with open_url(temp_path, "wt", encoding="utf-8", compression="gzip") as outfile: - for index, raw in enumerate(payload): - normalized = _normalize_record(raw, task.dataset, index) - json.dump(normalized, outfile, ensure_ascii=False) - outfile.write("\n") - record_count += 1 - - logger.info("Wrote %s records to %s", record_count, task.output_file_path) - return {"records": record_count, "output_file": task.output_file_path} - - 
-@dataclass -class DownloadTask: - download_url: str - output_file_path: str - dataset: UncheatableEvalDataset - cfg: UncheatableEvalDownloadConfig - - -def _generate_tasks( - datasets: Iterable[UncheatableEvalDataset], - cfg: UncheatableEvalDownloadConfig, -) -> tuple[list[DownloadTask], list[UncheatableEvalDataset]]: - tasks: list[DownloadTask] = [] - filtered: list[UncheatableEvalDataset] = [] - for dataset in datasets: - output_file = posixpath.join(str(cfg.output_path), dataset.output_filename()) - tasks.append(DownloadTask(dataset.download_url, output_file, dataset, cfg)) - filtered.append(dataset) - return tasks, filtered - - -def _write_metadata(cfg: UncheatableEvalDownloadConfig, records: list[dict[str, Any]]) -> None: - if not records: - return - metadata_path = posixpath.join(str(cfg.output_path), cfg.metadata_filename) - with open_url(metadata_path, "w", encoding="utf-8") as meta_file: - json.dump(records, meta_file, indent=2, ensure_ascii=False) - logger.info("Wrote metadata to %s", metadata_path) - - -def download_latest_uncheatable_eval(cfg: UncheatableEvalDownloadConfig) -> dict[str, Any]: - """Download and normalize the newest Uncheatable Eval dump for each benchmark.""" - - entries = _fetch_directory_listing(cfg) - datasets = _parse_available_dumps(entries) - latest_datasets = _select_latest_dumps(datasets) - - if not latest_datasets: - logger.warning("No datasets found that match the expected naming pattern") - return {"success": False, "reason": "no_datasets"} - - output_path = str(cfg.output_path) - fsspec_mkdirs(output_path, exist_ok=True) - - tasks, filtered_datasets = _generate_tasks(latest_datasets, cfg) - - if not tasks: - logger.info("No new datasets to process") - return {"success": True, "reason": "already_processed", "skipped": True} - - metadata_records: list[dict[str, Any]] = [] - - pipeline = ( - Dataset.from_list(tasks) - .map(lambda task: _download_and_convert_single(task)) - .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) - ) - ctx = ZephyrContext(name="download-uncheatable-eval") - output_paths = ctx.execute(pipeline) - - for dataset, metadata_file in zip(filtered_datasets, output_paths, strict=True): - with open_url(metadata_file, "r", encoding="utf-8") as meta_file: - result = json.load(meta_file) - - try: - metadata_records.append( - { - "benchmark": dataset.benchmark, - "start_date": dataset.start_date, - "end_date": dataset.end_date, - "source": dataset.source_label, - "output_file": posixpath.join(output_path, dataset.output_filename()), - "records": result.get("records"), - "sha": dataset.sha, - "size": dataset.size, - } - ) - except Exception: - logger.exception("Failed to process dataset %s", dataset.name) - raise - - _write_metadata(cfg, metadata_records) - return {"success": True, "processed": metadata_records} - - -def make_uncheatable_eval_step( - *, - name: str = "raw/uncheatable-eval/latest", - repo_owner: str = "ziqing-huang", - repo_name: str = "uncheatable_eval", - data_path: str = "data", - branch: str = "master", - max_concurrent_downloads: int = 8, - request_timeout: int = 120, - github_token: str | None = None, - skip_existing: bool = True, -) -> ExecutorStep[UncheatableEvalDownloadConfig]: - """Create an :class:`ExecutorStep` that downloads the latest Uncheatable Eval dumps.""" - - config = UncheatableEvalDownloadConfig( - output_path=this_output_path(), - repo_owner=ensure_versioned(repo_owner), - repo_name=ensure_versioned(repo_name), - data_path=ensure_versioned(data_path), - 
branch=ensure_versioned(branch), - max_concurrent_downloads=max_concurrent_downloads, - request_timeout=request_timeout, - github_token=github_token, - skip_existing=skip_existing, - ) - - return ExecutorStep( - name=name, - fn=download_latest_uncheatable_eval, - config=config, - ) - - -__all__ = [ - "UncheatableEvalDataset", - "UncheatableEvalDownloadConfig", - "download_latest_uncheatable_eval", - "make_uncheatable_eval_step", -] +from marin.datakit.download.uncheatable_eval import make_uncheatable_eval_step as make_uncheatable_eval_step diff --git a/lib/marin/src/marin/download/wikipedia/__init__.py b/lib/marin/src/marin/download/wikipedia/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/wikipedia/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/wikipedia/download.py b/lib/marin/src/marin/download/wikipedia/download.py index 552e546bf9..9b50143040 100644 --- a/lib/marin/src/marin/download/wikipedia/download.py +++ b/lib/marin/src/marin/download/wikipedia/download.py @@ -1,125 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.wikipedia -""" -wikipedia/download.py - -Download script for the Wikipedia raw HTML data, provided by Wikimedia. - -Home Page: https://dumps.wikimedia.org/other/enterprise_html/runs/ - -Example Usage (production, large dataset): -ENWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/enwiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz -uv run zephyr --backend=ray --max-parallelism=10 \ - lib/marin/src/marin/download/wikipedia/download.py \ - --input_urls $ENWIKI \ - --revision 20250320 --output_path gs://path/to/output - -Example Usage (local testing, small dataset): -SIMPLEWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/simplewiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz -uv run zephyr --backend=threadpool --max-parallelism=4 --entry-point=download \ - lib/marin/src/marin/download/wikipedia/download.py \ - --input_urls "[$SIMPLEWIKI]" \ - --revision 20250320 --output_path /tmp/wikipedia_test - -Note: The enwiki-NS0 file (English Wikipedia, namespace 0 = articles) is approximately 130 GB compressed. - The simplewiki-NS0 file (Simple English Wikipedia) is much smaller at ~2 GB compressed. 
-""" - -import logging -import os -import tarfile -from collections.abc import Iterable -from dataclasses import dataclass - -import draccus -import requests -from iris.marin_fs import open_url -from marin.utils import fsspec_size -from tqdm_loggable.auto import tqdm -from zephyr import Dataset, ZephyrContext, atomic_rename, load_jsonl - -logger = logging.getLogger(__name__) - - -@dataclass -class DownloadConfig: - input_urls: list[str] - revision: str - output_path: str - - -def download_tar(url: str, output_prefix) -> str: - shard_filename = url.split("/")[-1] - output_filename = os.path.join(output_prefix, shard_filename) - logger.info(f"Downloading URL: {url} to {output_filename}") - - try: - total_size = fsspec_size(url) - pbar = tqdm(total=total_size, desc="Downloading File", unit="B", unit_scale=True) - - with atomic_rename(output_filename) as tmp_filename, open_url(tmp_filename, "wb") as f: - r = requests.get(url, stream=True) - - for chunk in r.raw.stream(20 * 1024 * 1024, decode_content=False): - if chunk: - f.write(chunk) - f.flush() - - pbar.update(len(chunk)) - - return output_filename - except Exception as e: - logger.error(f"Error downloading URL: {url}") - raise e - - -def process_file(input_file: str, output_path: str) -> Iterable[str]: - logger.info(f"Processing file: {input_file}") - logger.info(f"Output path: {output_path}") - - try: - with open_url(input_file) as f: - with tarfile.open(fileobj=f, mode="r:gz") as tr: - for info in tr: - with tr.extractfile(info) as file: - file_content = file.read() - file_path = os.path.join(output_path, info.name + ".gz") - - # Each file is a .ndjson file, which contains about 18k-21k articles - # per file with size ranging from 200MB to 300MB - with ( - atomic_rename(file_path) as tmpfile_path, - open_url(tmpfile_path, "wb", compression="gzip") as output_f, - ): - output_f.write(file_content) - yield file_path - - except Exception as e: - logger.error(f"Error processing file: {input_file}") - raise e - - -@draccus.wrap() -def download(cfg: DownloadConfig) -> None: - """Download and process Wikipedia data.""" - logger.info("Starting transfer of Wikipedia dump...") - output_base = os.path.join(cfg.output_path, cfg.revision) - - ctx = ZephyrContext(name="download-wikipedia") - download_metrics = ctx.execute( - Dataset.from_list(cfg.input_urls) - .map(lambda url: download_tar(url, output_base)) - .write_jsonl(f"{output_base}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), - ) - - # load all of the output filenames to process - downloads = ctx.execute(Dataset.from_list(download_metrics).flat_map(load_jsonl)) - - extracted = ctx.execute( - Dataset.from_list(downloads) - .flat_map(lambda file: process_file(file, output_base)) - .write_jsonl(f"{output_base}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), - ) - - logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) +from marin.datakit.download.wikipedia import download as download +from marin.datakit.download.wikipedia import download_tar as download_tar +from marin.datakit.download.wikipedia import process_file as process_file From 23bf3c4bb9a91697109a22998b9b8ad59edb23f5 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:16:34 -0700 Subject: [PATCH 07/56] Extract pretraining download definitions into datakit/download/pretraining.py Creates canonical StepSpec factory functions for all pretraining dataset downloads (fineweb, dclm, slimpajama, etc.) in pretraining.py. 
Updates simple.py to import from there and build the backward-compat downloads dict via _build_downloads(). Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/simple.py | 195 +++++------------- .../src/marin/datakit/download/pretraining.py | 119 +++++++++++ 2 files changed, 167 insertions(+), 147 deletions(-) create mode 100644 lib/marin/src/marin/datakit/download/pretraining.py diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py index 7f51364735..79910f3741 100644 --- a/experiments/pretraining_datasets/simple.py +++ b/experiments/pretraining_datasets/simple.py @@ -12,8 +12,20 @@ from levanter.data.text import TextLmDatasetFormat from levanter.store.cache import CacheOptions -from marin.download.huggingface.download_hf import DownloadConfig, download_hf -from marin.execution.executor import ExecutorStep, this_output_path, versioned +from marin.datakit.download.pretraining import ( + dclm_baseline_download, + dclm_baseline_wrong_download, + dolma3_mix_150b_1025_download, + fineweb_download, + fineweb_edu_download, + proofpile_2_download, + slimpajama_6b_download, + slimpajama_download, + starcoderdata_download, + the_pile_openwebtext2_download, + the_stack_dedup_download, +) +from marin.execution.executor import ExecutorStep, InputName, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from experiments.llama import llama3_tokenizer @@ -25,7 +37,7 @@ def _tokenize_simple( name: str, - raw_dataset: ExecutorStep, + raw_dataset: ExecutorStep | InputName, tokenizer: str | None = None, override_path: str | None = None, text_format: TextLmDatasetFormat = TextLmDatasetFormat(), @@ -57,153 +69,42 @@ def _tokenize_simple( # RAW DATASET DOWNLOADS # ============================================================================ -downloads = { - "fineweb": ExecutorStep( - name="raw/fineweb", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="HuggingFaceFW/fineweb", - revision="cd85054", - gcs_output_path=this_output_path(), - wait_for_completion=True, + +def _build_downloads() -> dict[str, ExecutorStep | InputName]: + """Build the downloads dict from canonical StepSpec definitions in pretraining.py.""" + fineweb_edu_base = fineweb_edu_download().as_executor_step() + + return { + "fineweb": fineweb_download().as_executor_step(), + "fineweb_edu": fineweb_edu_base.cd("data"), + "fineweb_edu_sample_10bt": fineweb_edu_base.cd("sample/10BT"), + "fineweb_edu_sample_100bt": fineweb_edu_base.cd("sample/100BT"), + "fineweb_edu_sample_350bt": fineweb_edu_base.cd("sample/350BT"), + "slimpajama": ( + slimpajama_download() + .as_executor_step() + .cd("2d0accd/huggingface.co/datasets/cerebras/SlimPajama-627B/resolve/2d0accd") ), - override_output_path="raw/fineweb", - ), - "fineweb_edu": ( - ( - fineweb_edu_base_step := ExecutorStep( - name="raw/fineweb-edu", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="HuggingFaceFW/fineweb-edu", - revision=versioned((revision := "87f0914")), - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path=f"raw/fineweb-edu-{revision}", - ) - ).cd("data") - ), - "fineweb_edu_sample_10bt": fineweb_edu_base_step.cd("sample/10BT"), - "fineweb_edu_sample_100bt": fineweb_edu_base_step.cd("sample/100BT"), - "fineweb_edu_sample_350bt": fineweb_edu_base_step.cd("sample/350BT"), - "slimpajama": ( - ExecutorStep( - name="raw/SlimPajama-627B", - fn=download_hf, - config=DownloadConfig( - 
hf_dataset_id="cerebras/SlimPajama-627B", - revision="2d0accd", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/SlimPajama-627B-262830", - ).cd("2d0accd/huggingface.co/datasets/cerebras/SlimPajama-627B/resolve/2d0accd") - ), - "slimpajama_6b": ( - ExecutorStep( - name="raw/SlimPajama-6B", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="DKYoon/SlimPajama-6B", - revision="b5f90f4", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/SlimPajama-6B-be35b7", - ).cd("data") - ), - "dolma3_mix_150b_1025": ( - ExecutorStep( - name="raw/dolma3_mix-150B-1025", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="allenai/dolma3_mix-150B-1025", - revision="15d04ee", - gcs_output_path=this_output_path(), - wait_for_completion=True, - append_sha_to_path=True, - ), - override_output_path="raw/dolma3_mix-150B-1025-15d04ee", - ).cd("15d04ee") - ), - "dclm_baseline_wrong": ExecutorStep( - name="raw/dclm-baseline-1.0", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="mlfoundations/dclm-baseline-1.0", - revision="a3b142c", - gcs_output_path=this_output_path(), - wait_for_completion=True, + "slimpajama_6b": slimpajama_6b_download().as_executor_step().cd("data"), + "dolma3_mix_150b_1025": dolma3_mix_150b_1025_download().as_executor_step().cd("15d04ee"), + "dclm_baseline_wrong": dclm_baseline_wrong_download().as_executor_step(), + "dclm_baseline": dclm_baseline_download().as_executor_step().cd("a3b142c"), + "the_stack_dedup": the_stack_dedup_download().as_executor_step().cd("17cad72"), + "proofpile_2": ( + proofpile_2_download() + .as_executor_step() + .cd("901a927/huggingface.co/datasets/EleutherAI/proof-pile-2/resolve/901a927") ), - override_output_path="raw/dclm_WRONG_20250211/", - ), - "dclm_baseline": ( - ExecutorStep( - name="raw/dclm-baseline-1.0", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="mlfoundations/dclm-baseline-1.0", - revision="a3b142c", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/dclm", - ).cd("a3b142c") - ), - "the_stack_dedup": ( - ExecutorStep( - name="raw/the-stack-dedup", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="bigcode/the-stack-dedup", - revision="17cad72", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/the-stack-dedup-4ba450", - ).cd("17cad72") - ), - "proofpile_2": ( - ExecutorStep( - name="raw/proof-pile-2", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="EleutherAI/proof-pile-2", - revision="901a927", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/proof-pile-2-f1b1d8", - ).cd("901a927/huggingface.co/datasets/EleutherAI/proof-pile-2/resolve/901a927") - ), - "the_pile_openwebtext2": ( - ExecutorStep( - name="raw/the_pile_openwebtext2", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="vietgpt/the_pile_openwebtext2", - revision="1de27c6", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/the_pile_openwebtext2", - ).cd("1de27c6/huggingface.co/datasets/vietgpt/the_pile_openwebtext2/resolve/1de27c6") - ), - # TODO: Earlier datasets were stored in gcs_output_path/ instead of gcs_output_path. - # Migrate the dataset and cd can be removed. 
- "starcoderdata": ExecutorStep( - name="raw/starcoderdata", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="bigcode/starcoderdata", - revision="9fc30b5", - gcs_output_path=this_output_path(), - wait_for_completion=True, + "the_pile_openwebtext2": ( + the_pile_openwebtext2_download() + .as_executor_step() + .cd("1de27c6/huggingface.co/datasets/vietgpt/the_pile_openwebtext2/resolve/1de27c6") ), - override_output_path="raw/starcoderdata-720c8c", - ), -} + "starcoderdata": starcoderdata_download().as_executor_step(), + } + + +downloads = _build_downloads() # ============================================================================ diff --git a/lib/marin/src/marin/datakit/download/pretraining.py b/lib/marin/src/marin/datakit/download/pretraining.py new file mode 100644 index 0000000000..3300820ba3 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/pretraining.py @@ -0,0 +1,119 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Pre-defined download steps for common pretraining datasets. + +Each function returns a StepSpec for downloading a specific dataset from +HuggingFace. These are the canonical definitions — experiments should +import from here rather than defining download steps inline. + +For datasets where the actual data lives in a subdirectory of the download +(e.g. fineweb-edu has data under ``data/``), the function returns the +StepSpec for the base download. Consumers that need the subdirectory path +should use ``step.output_path + "/data"`` or convert to ExecutorStep and +use ``.cd("data")``. +""" + +from marin.datakit.download.huggingface import download_hf_step +from marin.execution.step_spec import StepSpec + + +def fineweb_download() -> StepSpec: + return download_hf_step( + "raw/fineweb", + hf_dataset_id="HuggingFaceFW/fineweb", + revision="cd85054", + override_output_path="raw/fineweb", + ) + + +def fineweb_edu_download() -> StepSpec: + """Base download for fineweb-edu. 
Data is under the ``data/`` subdirectory.""" + return download_hf_step( + "raw/fineweb-edu", + hf_dataset_id="HuggingFaceFW/fineweb-edu", + revision="87f0914", + override_output_path="raw/fineweb-edu-87f0914", + ) + + +def slimpajama_download() -> StepSpec: + return download_hf_step( + "raw/SlimPajama-627B", + hf_dataset_id="cerebras/SlimPajama-627B", + revision="2d0accd", + override_output_path="raw/SlimPajama-627B-262830", + ) + + +def slimpajama_6b_download() -> StepSpec: + return download_hf_step( + "raw/SlimPajama-6B", + hf_dataset_id="DKYoon/SlimPajama-6B", + revision="b5f90f4", + override_output_path="raw/SlimPajama-6B-be35b7", + ) + + +def dolma3_mix_150b_1025_download() -> StepSpec: + return download_hf_step( + "raw/dolma3_mix-150B-1025", + hf_dataset_id="allenai/dolma3_mix-150B-1025", + revision="15d04ee", + override_output_path="raw/dolma3_mix-150B-1025-15d04ee", + ) + + +def dclm_baseline_download() -> StepSpec: + return download_hf_step( + "raw/dclm-baseline-1.0", + hf_dataset_id="mlfoundations/dclm-baseline-1.0", + revision="a3b142c", + override_output_path="raw/dclm", + ) + + +def the_stack_dedup_download() -> StepSpec: + return download_hf_step( + "raw/the-stack-dedup", + hf_dataset_id="bigcode/the-stack-dedup", + revision="17cad72", + override_output_path="raw/the-stack-dedup-4ba450", + ) + + +def proofpile_2_download() -> StepSpec: + return download_hf_step( + "raw/proof-pile-2", + hf_dataset_id="EleutherAI/proof-pile-2", + revision="901a927", + override_output_path="raw/proof-pile-2-f1b1d8", + ) + + +def the_pile_openwebtext2_download() -> StepSpec: + return download_hf_step( + "raw/the_pile_openwebtext2", + hf_dataset_id="vietgpt/the_pile_openwebtext2", + revision="1de27c6", + override_output_path="raw/the_pile_openwebtext2", + ) + + +def starcoderdata_download() -> StepSpec: + return download_hf_step( + "raw/starcoderdata", + hf_dataset_id="bigcode/starcoderdata", + revision="9fc30b5", + override_output_path="raw/starcoderdata-720c8c", + ) + + +def dclm_baseline_wrong_download() -> StepSpec: + """Legacy download with incorrect path. Kept for backward compat.""" + return download_hf_step( + "raw/dclm-baseline-1.0", + hf_dataset_id="mlfoundations/dclm-baseline-1.0", + revision="a3b142c", + override_output_path="raw/dclm_WRONG_20250211/", + ) From 6ffda47a494173c7a5afc4b034b9af6b76525e02 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:32:44 -0700 Subject: [PATCH 08/56] Fix mock targets in download tests to use canonical module paths Update mock/patch targets in test_huggingface.py, test_nemotron_cc.py, and test_dclm_hq.py to point at the canonical marin.datakit.download.* locations. Add _relative_path_in_source to the HF download shim since the test imports it. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/download/huggingface/download_hf.py | 1 + tests/download/test_dclm_hq.py | 2 +- tests/download/test_huggingface.py | 14 +++++++------- tests/download/test_nemotron_cc.py | 4 ++-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/marin/src/marin/download/huggingface/download_hf.py b/lib/marin/src/marin/download/huggingface/download_hf.py index 9912a5d2c0..2dd0177806 100644 --- a/lib/marin/src/marin/download/huggingface/download_hf.py +++ b/lib/marin/src/marin/download/huggingface/download_hf.py @@ -3,6 +3,7 @@ # Backward-compat shim. 
Canonical location: marin.datakit.download.huggingface from marin.datakit.download.huggingface import DownloadConfig as DownloadConfig +from marin.datakit.download.huggingface import _relative_path_in_source as _relative_path_in_source from marin.datakit.download.huggingface import download_hf as download_hf from marin.datakit.download.huggingface import ensure_fsspec_path_writable as ensure_fsspec_path_writable from marin.datakit.download.huggingface import main as main diff --git a/tests/download/test_dclm_hq.py b/tests/download/test_dclm_hq.py index 1636f3c34d..21ec33c5b7 100644 --- a/tests/download/test_dclm_hq.py +++ b/tests/download/test_dclm_hq.py @@ -170,7 +170,7 @@ def mock_requests_get(url, **kwargs): raise ValueError(f"Unexpected URL: {url}") - with patch("marin.download.dclm_hq.download_dclm_hq_html.requests.get", side_effect=mock_requests_get): + with patch("marin.datakit.download.dclm_hq.requests.get", side_effect=mock_requests_get): cfg = DCLMHQDownloadConfig(input_path=str(tmp_path / "input"), output_path=str(output_dir)) extract_dclm_hq_dump(cfg) diff --git a/tests/download/test_huggingface.py b/tests/download/test_huggingface.py index 1019c83633..24a5bc6169 100644 --- a/tests/download/test_huggingface.py +++ b/tests/download/test_huggingface.py @@ -10,13 +10,13 @@ import pandas as pd import pytest -from marin.download.huggingface.download_hf import ( +from marin.datakit.download.huggingface import ( DownloadConfig, _relative_path_in_source, download_hf, stream_file_to_fsspec, ) -from marin.download.huggingface.stream_remove_columns import ( +from marin.datakit.download.stream_remove_columns import ( DatasetConfig, prune_hf_dataset, ) @@ -81,7 +81,7 @@ def test_download_hf_basic(mock_hf_fs, tmp_path): ) # Mock HfFileSystem creation - with patch("marin.download.huggingface.download_hf.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): download_hf(cfg) # Verify files were downloaded @@ -123,7 +123,7 @@ def test_download_hf_appends_sha_when_configured(mock_hf_fs, tmp_path): append_sha_to_path=True, ) - with patch("marin.download.huggingface.download_hf.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): download_hf(cfg) target_output = base_output_path / revision @@ -189,7 +189,7 @@ def create_buffer(): mock_fs.glob = Mock(return_value=["hf://datasets/test-org/test-dataset@main/data/file.parquet"]) mock_fs.open = Mock(side_effect=lambda path, mode="rb": create_buffer()) - with patch("marin.download.huggingface.stream_remove_columns.hf_fs", mock_fs): + with patch("marin.datakit.download.stream_remove_columns.hf_fs", mock_fs): prune_hf_dataset(cfg) # Verify output @@ -229,8 +229,8 @@ def read(self, chunk_size): hf_fs.open.side_effect = lambda path, mode="rb", **_kwargs: FlakyReader() with ( - patch("marin.download.huggingface.download_hf.HfFileSystem", return_value=hf_fs), - patch("marin.download.huggingface.download_hf.time.sleep", return_value=None), + patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs), + patch("marin.datakit.download.huggingface.time.sleep", return_value=None), ): result = stream_file_to_fsspec( str(output_path), diff --git a/tests/download/test_nemotron_cc.py b/tests/download/test_nemotron_cc.py index 6f3bdff56c..04ac04e2af 100644 --- a/tests/download/test_nemotron_cc.py +++ b/tests/download/test_nemotron_cc.py @@ -11,8 +11,8 @@ from iris.marin_fs import open_url as _real_open_url from 
marin.download.nemotron_cc.download_nemotron_cc import NemotronIngressConfig, download_nemotron_cc -_OPEN_URL_TARGET = "marin.download.nemotron_cc.download_nemotron_cc.open_url" -_REQUESTS_SESSION_TARGET = "marin.download.nemotron_cc.download_nemotron_cc.requests.Session" +_OPEN_URL_TARGET = "marin.datakit.download.nemotron_cc.open_url" +_REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_cc.requests.Session" SAMPLE_NEMOTRON_RECORDS = [ { From 3aa8bcc16af8b0e530495d101a19b837b8aaf079 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 11:05:04 -0700 Subject: [PATCH 09/56] Migrate all imports from marin.download.* to marin.datakit.download.* Updates 23 files across experiments/, tests/, and lib/ to import from the canonical marin.datakit.download.* paths. Removes the stale datakit/download.py file left over from the package conversion. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../common_pile/tokenize_common_pile.py | 2 +- experiments/defaults.py | 2 +- experiments/eval_datasets.py | 2 +- .../evals/exp1600_uncheatable_evals.py | 2 +- experiments/midtraining_datasets.py | 2 +- experiments/models.py | 2 +- .../download_and_tokenize_fineweb2_hq.py | 2 +- experiments/paloma.py | 2 +- experiments/posttrain/preference_datasets.py | 2 +- experiments/pretraining_datasets/dolma.py | 2 +- experiments/pretraining_datasets/dolmino.py | 2 +- experiments/pretraining_datasets/nemotron.py | 2 +- .../pretraining_datasets/nemotron_v2.py | 2 +- experiments/tootsie/exp1063_upload_tootsie.py | 2 +- .../eval_datasets_overlap.py | 2 +- experiments/two_stage/data.py | 2 +- lib/marin/src/marin/datakit/download.py | 62 ------------------- .../tokenize/download_pretokenized.py | 2 +- .../marin/speedrun/paloma_local_download.py | 4 +- .../transform/dolmino/transform_dclm_hq.py | 4 +- tests/download/test_ar5iv.py | 2 +- tests/download/test_dclm_hq.py | 2 +- tests/download/test_nemotron_cc.py | 2 +- tests/test_hfdataset_spec.py | 2 +- 24 files changed, 25 insertions(+), 87 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download.py diff --git a/experiments/common_pile/tokenize_common_pile.py b/experiments/common_pile/tokenize_common_pile.py index cf90e364ee..faee07fc76 100644 --- a/experiments/common_pile/tokenize_common_pile.py +++ b/experiments/common_pile/tokenize_common_pile.py @@ -5,7 +5,7 @@ from experiments.defaults import default_tokenize from experiments.llama import llama3_tokenizer -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path from marin.processing.tokenize.data_configs import TokenizerStep, lm_mixture_data_config diff --git a/experiments/defaults.py b/experiments/defaults.py index 2636c945b5..01e9583442 100644 --- a/experiments/defaults.py +++ b/experiments/defaults.py @@ -46,7 +46,7 @@ from experiments.simple_sft_config import SimpleSFTConfig from experiments.simple_train_config import SimpleTrainConfig from levanter.utils.mesh import MeshConfig -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.evaluation.evaluation_config import EvalTaskConfig from marin.execution.executor import ( ExecutorStep, diff --git a/experiments/eval_datasets.py b/experiments/eval_datasets.py index 1a79a4a994..f55df8b3fc 100644 --- a/experiments/eval_datasets.py +++ 
b/experiments/eval_datasets.py @@ -3,7 +3,7 @@ import dataclasses -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/evals/exp1600_uncheatable_evals.py b/experiments/evals/exp1600_uncheatable_evals.py index 50f57d63df..e2787a3351 100644 --- a/experiments/evals/exp1600_uncheatable_evals.py +++ b/experiments/evals/exp1600_uncheatable_evals.py @@ -22,7 +22,7 @@ from experiments.models import ModelConfig as HFModelConfig, download_model_step from fray.cluster import ResourceConfig from levanter.compat.hf_checkpoints import HFCheckpointConverter -from marin.download.uncheatable_eval.download import make_uncheatable_eval_step +from marin.datakit.download.uncheatable_eval import make_uncheatable_eval_step from marin.evaluation.log_probs import default_lm_log_probs from marin.execution.executor import ExecutorStep, executor_main, output_path_of from marin.processing.tokenize import TokenizeConfig diff --git a/experiments/midtraining_datasets.py b/experiments/midtraining_datasets.py index f96217880d..2706f8a4e9 100644 --- a/experiments/midtraining_datasets.py +++ b/experiments/midtraining_datasets.py @@ -4,7 +4,7 @@ from experiments.common_pile.tokenize_common_pile import stackv2_edu_filtered from experiments.defaults import default_download, default_tokenize from experiments.llama import llama3_tokenizer -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution import versioned from marin.execution.executor import ExecutorStep, this_output_path from marin.processing.tokenize import lm_mixture_data_config diff --git a/experiments/models.py b/experiments/models.py index 9e2a2db79b..972ca4f753 100644 --- a/experiments/models.py +++ b/experiments/models.py @@ -18,7 +18,7 @@ from dataclasses import dataclass -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.utils import get_directory_friendly_name diff --git a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py index 6c93fba71a..a3fd2ae82a 100644 --- a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py +++ b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py @@ -13,7 +13,7 @@ from experiments.llama import llama3_tokenizer from experiments.multilingual_fineweb2_hq.constants import FINEWEB2_DATASETS -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/paloma.py b/experiments/paloma.py index 74bd98e25f..24c1a536df 100644 --- a/experiments/paloma.py +++ b/experiments/paloma.py @@ -9,7 +9,7 @@ import os.path 
-from marin.download.huggingface.download_hf import DownloadConfig as HfDownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig, download_hf # cyclic dependency # from experiments.llama import llama3_tokenizer diff --git a/experiments/posttrain/preference_datasets.py b/experiments/posttrain/preference_datasets.py index e93e94a61b..105722d2af 100644 --- a/experiments/posttrain/preference_datasets.py +++ b/experiments/posttrain/preference_datasets.py @@ -22,7 +22,7 @@ from collections.abc import Sequence from dataclasses import dataclass, field -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ( ExecutorStep, executor_main, diff --git a/experiments/pretraining_datasets/dolma.py b/experiments/pretraining_datasets/dolma.py index 5c176c01f7..02b62df0aa 100644 --- a/experiments/pretraining_datasets/dolma.py +++ b/experiments/pretraining_datasets/dolma.py @@ -10,7 +10,7 @@ import os.path -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned, InputName from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/dolmino.py b/experiments/pretraining_datasets/dolmino.py index 414e0e28dc..25dab84f52 100644 --- a/experiments/pretraining_datasets/dolmino.py +++ b/experiments/pretraining_datasets/dolmino.py @@ -5,7 +5,7 @@ import os.path -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index 52c9d17d69..ac981b9720 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -8,7 +8,7 @@ from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE from experiments.pretraining_datasets.dclm import dclm_components_llama3 -from marin.download.nemotron_cc.download_nemotron_cc import NemotronIngressConfig, download_nemotron_cc +from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/nemotron_v2.py b/experiments/pretraining_datasets/nemotron_v2.py index 66d618ad53..ccb79f9e14 100644 --- a/experiments/pretraining_datasets/nemotron_v2.py +++ b/experiments/pretraining_datasets/nemotron_v2.py @@ -14,7 +14,7 @@ import os.path -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from 
marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/tootsie/exp1063_upload_tootsie.py b/experiments/tootsie/exp1063_upload_tootsie.py index 55d06ec875..d12aa5e060 100644 --- a/experiments/tootsie/exp1063_upload_tootsie.py +++ b/experiments/tootsie/exp1063_upload_tootsie.py @@ -25,7 +25,7 @@ from dataclasses import dataclass, field -from marin.download.huggingface.upload_gcs_to_hf import UploadConfig, upload_gcs_to_hf +from marin.datakit.download.upload_gcs_to_hf import UploadConfig, upload_gcs_to_hf from marin.execution.executor import ExecutorStep, executor_main diff --git a/experiments/train_test_overlap/eval_datasets_overlap.py b/experiments/train_test_overlap/eval_datasets_overlap.py index c6e7469221..b7df8679aa 100644 --- a/experiments/train_test_overlap/eval_datasets_overlap.py +++ b/experiments/train_test_overlap/eval_datasets_overlap.py @@ -1,7 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/two_stage/data.py b/experiments/two_stage/data.py index 9aeca84456..c78daf0ab1 100644 --- a/experiments/two_stage/data.py +++ b/experiments/two_stage/data.py @@ -6,7 +6,7 @@ from experiments.midtraining_datasets import finemath_3_plus_tokenized from experiments.pretraining_datasets import tokenize_dolma from experiments.pretraining_datasets.simple import tokenized -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path dolma_components = tokenize_dolma() diff --git a/lib/marin/src/marin/datakit/download.py b/lib/marin/src/marin/datakit/download.py deleted file mode 100644 index 0724472143..0000000000 --- a/lib/marin/src/marin/datakit/download.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Datakit download stage — fetch a HuggingFace dataset to persistent storage.""" - -from marin.download.huggingface.download_hf import DownloadConfig, download_hf -from marin.execution.step_spec import StepSpec - - -def download_step( - name: str, - *, - hf_dataset_id: str, - revision: str, - hf_urls_glob: list[str] | None = None, - zephyr_max_parallelism: int = 8, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that downloads a HuggingFace dataset. - - The raw download is preserved as-is in its original format and directory structure. - - Args: - name: Step name (e.g. "fineweb/download"). - hf_dataset_id: HuggingFace dataset identifier (e.g. "HuggingFaceFW/fineweb"). - revision: Commit hash from the HF dataset repo. - hf_urls_glob: Glob patterns to select specific files. Empty means all files. - zephyr_max_parallelism: Maximum download parallelism. - deps: Optional upstream dependencies. - output_path_prefix: Override the default output path prefix. - override_output_path: Override the computed output path entirely. - - Returns: - A StepSpec whose output_path contains the raw downloaded files. 
- """ - resolved_glob = hf_urls_glob or [] - - def _run(output_path: str) -> None: - download_hf( - DownloadConfig( - hf_dataset_id=hf_dataset_id, - revision=revision, - hf_urls_glob=resolved_glob, - gcs_output_path=output_path, - zephyr_max_parallelism=zephyr_max_parallelism, - ) - ) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={ - "hf_dataset_id": hf_dataset_id, - "revision": revision, - "hf_urls_glob": resolved_glob, - }, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) diff --git a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py index 91fdaca495..cab2433bec 100644 --- a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py +++ b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py @@ -18,7 +18,7 @@ ) from levanter.store.cache import CacheOptions -from marin.download.huggingface.download_hf import ( +from marin.datakit.download.huggingface import ( DownloadConfig as HfDownloadConfig, download_hf as hf_download_logic, ) diff --git a/lib/marin/src/marin/speedrun/paloma_local_download.py b/lib/marin/src/marin/speedrun/paloma_local_download.py index c7335a52c5..e2ee68f766 100644 --- a/lib/marin/src/marin/speedrun/paloma_local_download.py +++ b/lib/marin/src/marin/speedrun/paloma_local_download.py @@ -8,8 +8,8 @@ """ from experiments.paloma import paloma_tokenized -from marin.download import HfDownloadConfig -from marin.download.huggingface.download_hf import download_hf +from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig +from marin.datakit.download.huggingface import download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned llama3_tokenizer = "meta-llama/Meta-Llama-3.1-8B" diff --git a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py b/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py index 773cb3242a..dfaf263121 100644 --- a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py +++ b/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py @@ -40,8 +40,8 @@ import draccus from iris.marin_fs import open_url, url_to_fs -from marin.download.dclm_hq.download_dclm_hq_html import find_html_in_cc -from marin.download.huggingface.stream_remove_columns import hf_fs +from marin.datakit.download.dclm_hq import find_html_in_cc +from marin.datakit.download.stream_remove_columns import hf_fs from marin.schemas.web.convert import ExtractionConfig from marin.web.convert import convert_page from tqdm import tqdm diff --git a/tests/download/test_ar5iv.py b/tests/download/test_ar5iv.py index 442d557637..570fb706e3 100644 --- a/tests/download/test_ar5iv.py +++ b/tests/download/test_ar5iv.py @@ -7,7 +7,7 @@ import pytest -from marin.download.ar5iv.download import DownloadConfig, download +from marin.datakit.download.ar5iv import Ar5ivDownloadConfig as DownloadConfig, download @pytest.fixture diff --git a/tests/download/test_dclm_hq.py b/tests/download/test_dclm_hq.py index 21ec33c5b7..4ca4f48aef 100644 --- a/tests/download/test_dclm_hq.py +++ b/tests/download/test_dclm_hq.py @@ -7,7 +7,7 @@ from unittest.mock import patch import zstandard as zstd -from marin.download.dclm_hq.download_dclm_hq_html import DCLMHQDownloadConfig, extract_dclm_hq_dump +from marin.datakit.download.dclm_hq import DCLMHQDownloadConfig, extract_dclm_hq_dump SAMPLE_DCLM_RECORDS = [ { diff --git a/tests/download/test_nemotron_cc.py 
b/tests/download/test_nemotron_cc.py index 04ac04e2af..4657d008eb 100644 --- a/tests/download/test_nemotron_cc.py +++ b/tests/download/test_nemotron_cc.py @@ -9,7 +9,7 @@ import pytest import zstandard as zstd from iris.marin_fs import open_url as _real_open_url -from marin.download.nemotron_cc.download_nemotron_cc import NemotronIngressConfig, download_nemotron_cc +from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc _OPEN_URL_TARGET = "marin.datakit.download.nemotron_cc.open_url" _REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_cc.requests.Session" diff --git a/tests/test_hfdataset_spec.py b/tests/test_hfdataset_spec.py index 7bdd0d535c..14ad782471 100644 --- a/tests/test_hfdataset_spec.py +++ b/tests/test_hfdataset_spec.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from experiments.defaults import default_download, default_tokenize -from marin.download.huggingface.download_hf import DownloadConfig +from marin.datakit.download.huggingface import DownloadConfig from marin.processing.tokenize import HfDatasetSpec from marin.processing.tokenize.tokenize import HfTokenizeConfig, TokenizeConfig From ef02bd877aa0f444377e963e2ea70004a3b7d7c5 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 11:08:42 -0700 Subject: [PATCH 10/56] Migrate imports to canonical paths and simplify download functions Updates all 23 consumer files to import from marin.datakit.download.* instead of marin.download.*. Refactors download functions (transfer_files, download_nemotron_cc, extract_dclm_hq_dump) to accept plain parameters instead of requiring config dataclass construction. Config classes are kept for backward compat with ExecutorStep callers. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/dclm_hq.py | 35 ++++++----- .../src/marin/datakit/download/filesystem.py | 63 ++++++++----------- .../src/marin/datakit/download/nemotron_cc.py | 23 +++++-- 3 files changed, 64 insertions(+), 57 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/dclm_hq.py b/lib/marin/src/marin/datakit/download/dclm_hq.py index 83c127c079..ab00c2037d 100644 --- a/lib/marin/src/marin/datakit/download/dclm_hq.py +++ b/lib/marin/src/marin/datakit/download/dclm_hq.py @@ -172,35 +172,40 @@ def process_file(task: FileTask) -> None: raise -def extract_dclm_hq_dump(cfg: DCLMHQDownloadConfig) -> None: - """Process the DCLM HQ dump in the input path and save the results to the output path. +def extract_dclm_hq_dump(input_path_or_cfg: str | DCLMHQDownloadConfig, output_path: str | None = None) -> None: + """Process the DCLM HQ dump and enrich with HTML from Common Crawl. - Flattens the nested directory structure (shards → files) into a single list of files - and processes them in parallel using zephyr. + Args: + input_path_or_cfg: Input directory path, or a DCLMHQDownloadConfig for backward compat. + output_path: Output directory path. Required when input_path_or_cfg is a string. 
""" - logger.info(f"Starting processing of DCLM HQ dump in {cfg.input_path}") + if isinstance(input_path_or_cfg, DCLMHQDownloadConfig): + input_path = input_path_or_cfg.input_path + output_path = input_path_or_cfg.output_path + else: + input_path = input_path_or_cfg + if output_path is None: + raise ValueError("output_path is required when input_path_or_cfg is a string") + + logger.info(f"Starting processing of DCLM HQ dump in {input_path}") - # Flatten nested structure: discover all files upfront all_files = [] - paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(cfg.input_path, "*"))] + paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(input_path, "*"))] logger.info(f"Found {len(paths)} shards to process") for path in paths: - input_path = os.path.join(cfg.input_path, path) - shard_paths = fsspec_glob(os.path.join(input_path, "*.json.zst")) + shard_input = os.path.join(input_path, path) + shard_paths = fsspec_glob(os.path.join(shard_input, "*.json.zst")) for shard_path in shard_paths: - input_file_path = shard_path - output_file_path = os.path.join(cfg.output_path, path, os.path.basename(shard_path)).replace( + output_file_path = os.path.join(output_path, path, os.path.basename(shard_path)).replace( ".json.zst", ".jsonl.gz" ) - - all_files.append(FileTask(input_file_path=input_file_path, output_file_path=output_file_path)) + all_files.append(FileTask(input_file_path=shard_path, output_file_path=output_file_path)) logger.info(f"Found {len(all_files)} files to process") - # Single-level parallelism over all files pipeline = Dataset.from_list(all_files).map(process_file) ctx = ZephyrContext(name="download-dclm-html") @@ -220,7 +225,7 @@ def dclm_hq_step( """Create a StepSpec that downloads DCLM HQ HTML data from Common Crawl.""" def _run(output_path: str) -> None: - extract_dclm_hq_dump(DCLMHQDownloadConfig(input_path=input_path, output_path=output_path)) + extract_dclm_hq_dump(input_path, output_path) return StepSpec( name=name, diff --git a/lib/marin/src/marin/datakit/download/filesystem.py b/lib/marin/src/marin/datakit/download/filesystem.py index 287426666f..0177d644ef 100644 --- a/lib/marin/src/marin/datakit/download/filesystem.py +++ b/lib/marin/src/marin/datakit/download/filesystem.py @@ -1,6 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +import logging import os import random import time @@ -12,61 +13,58 @@ from marin.utils import fsspec_exists, fsspec_glob +logger = logging.getLogger(__name__) + @dataclass class TransferConfig: + """Kept for backward compatibility. Prefer ``transfer_files()`` with flat params.""" + input_path: str output_path: str - - # Selectively choose the number of random files to transfer. None means all files num_random_files: int | None = None filetype: str = "jsonl.zst" -def transfer_files(config: TransferConfig) -> None: - """Transfers files from the input path to the output path. +def transfer_files( + input_path: str, + output_path: str, + *, + num_random_files: int | None = None, + filetype: str = "jsonl.zst", +) -> None: + """Transfer files from input_path to output_path. - When num_random_files is None, copies the entire directory recursively. - When num_random_files is specified, randomly samples that many files and - copies them in parallel using zephyr. + When num_random_files is None, copies all matching files. + When specified, randomly samples that many files. 
""" - if config.input_path.endswith("/"): - input_path = config.input_path[:-1] - else: - input_path = config.input_path + input_path = input_path.rstrip("/") - print(f"Downloading {input_path} from GCS.") - start_time: float = time.time() + logger.info("Transferring %s to %s", input_path, output_path) + start_time = time.time() fs, _ = url_to_fs(input_path) if not fs.exists(input_path): raise FileNotFoundError(f"{input_path} does not exist.") - # Glob all matching files - filenames = fsspec_glob(os.path.join(input_path, f"**/*.{config.filetype}")) + filenames = fsspec_glob(os.path.join(input_path, f"**/*.{filetype}")) - # Select files: either random sample or all files - if config.num_random_files is None: - selected_files = filenames - else: + if num_random_files is not None: random.seed(42) random.shuffle(filenames) - selected_files = filenames[: config.num_random_files] + filenames = filenames[:num_random_files] def copy_file(filename: str) -> None: - """Copy a single file if it doesn't already exist at destination.""" - output_filename = os.path.join(config.output_path, os.path.basename(filename)) + output_filename = os.path.join(output_path, os.path.basename(filename)) if not fsspec_exists(output_filename): - # Ensure output directory exists - fs.makedirs(config.output_path, exist_ok=True) + fs.makedirs(output_path, exist_ok=True) fs.copy(filename, output_filename) - # Always use parallel copying via zephyr - pipeline = Dataset.from_list(selected_files).map(copy_file) + pipeline = Dataset.from_list(filenames).map(copy_file) ctx = ZephyrContext(name="fs-transfer") ctx.execute(pipeline) - elapsed_time_seconds: float = time.time() - start_time - print(f"Downloaded {input_path} to {config.output_path} ({elapsed_time_seconds}s).") + elapsed = time.time() - start_time + logger.info("Transferred %s to %s (%.1fs)", input_path, output_path, elapsed) def transfer_step( @@ -82,14 +80,7 @@ def transfer_step( """Create a StepSpec that transfers files between fsspec paths.""" def _run(output_path: str) -> None: - transfer_files( - TransferConfig( - input_path=input_path, - output_path=output_path, - num_random_files=num_random_files, - filetype=filetype, - ) - ) + transfer_files(input_path, output_path, num_random_files=num_random_files, filetype=filetype) return StepSpec( name=name, diff --git a/lib/marin/src/marin/datakit/download/nemotron_cc.py b/lib/marin/src/marin/datakit/download/nemotron_cc.py index 4b32983091..ba06ba00f2 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_cc.py +++ b/lib/marin/src/marin/datakit/download/nemotron_cc.py @@ -87,11 +87,22 @@ def download_single_nemotron_path(input_file_path: str, output_file_path: str) - @dataclass class NemotronIngressConfig: + """Kept for backward compatibility with ExecutorStep callers.""" + output_path: str = THIS_OUTPUT_PATH -def download_nemotron_cc(cfg: NemotronIngressConfig): - paths_file_path = os.path.join(cfg.output_path, "data-jsonl.paths") +def download_nemotron_cc(output_path_or_cfg: str | NemotronIngressConfig) -> None: + """Download and process Nemotron-CC dataset from Common Crawl. + + Args: + output_path_or_cfg: Output directory path, or a NemotronIngressConfig for backward compat. 
+ """ + output_path = ( + output_path_or_cfg.output_path if isinstance(output_path_or_cfg, NemotronIngressConfig) else output_path_or_cfg + ) + + paths_file_path = os.path.join(output_path, "data-jsonl.paths") logger.info(f"Downloading Nemotron CC path file {paths_file_path}") with open_url(NCC_PATH_FILE_URL, "rb") as f, open_url(paths_file_path, "wb") as f_out: @@ -102,7 +113,7 @@ def download_nemotron_cc(cfg: NemotronIngressConfig): with open_url(paths_file_path, "r", compression="gzip") as f: for line in f: file = line.strip() - output_file_path = os.path.join(cfg.output_path, file).replace("jsonl.zstd", "jsonl.zst") + output_file_path = os.path.join(output_path, file).replace("jsonl.zstd", "jsonl.zst") all_files.append((file, output_file_path)) logger.info(f"Processing {len(all_files)} Nemotron CC files") @@ -111,13 +122,13 @@ def download_nemotron_cc(cfg: NemotronIngressConfig): Dataset.from_list(all_files) .filter(lambda file_info: not fsspec_exists(file_info[1])) .map(lambda file_info: download_single_nemotron_path(*file_info)) - .write_jsonl(os.path.join(cfg.output_path, ".metrics/download-{shard:05d}.jsonl"), skip_existing=True) + .write_jsonl(os.path.join(output_path, ".metrics/download-{shard:05d}.jsonl"), skip_existing=True) ) ctx = ZephyrContext(name="download-nemotron-cc") ctx.execute(pipeline) - logger.info(f"Downloaded Nemotron CC files to {cfg.output_path}") + logger.info(f"Downloaded Nemotron CC files to {output_path}") def nemotron_cc_step( @@ -130,7 +141,7 @@ def nemotron_cc_step( """Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl.""" def _run(output_path: str) -> None: - download_nemotron_cc(NemotronIngressConfig(output_path=output_path)) + download_nemotron_cc(output_path) return StepSpec( name=name, From 4c8f38f3a52688589446478c45e272a81bd2c14a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 12:09:47 -0700 Subject: [PATCH 11/56] Remove unused config dataclasses from download functions Removes NemotronIngressConfig, DCLMHQDownloadConfig, and TransferConfig. The underlying functions (download_nemotron_cc, extract_dclm_hq_dump, transfer_files) now take plain parameters directly. Updates tests and nemotron.py experiment to use the flat-param API or *_step() functions. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/nemotron.py | 10 ++------ .../src/marin/datakit/download/dclm_hq.py | 23 ++----------------- .../src/marin/datakit/download/filesystem.py | 11 --------- .../src/marin/datakit/download/nemotron_cc.py | 20 ++-------------- .../download/dclm_hq/download_dclm_hq_html.py | 1 - .../src/marin/download/filesystem/transfer.py | 1 - .../nemotron_cc/download_nemotron_cc.py | 1 - tests/download/test_dclm_hq.py | 5 ++-- tests/download/test_nemotron_cc.py | 11 ++++----- 9 files changed, 12 insertions(+), 71 deletions(-) diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index ac981b9720..4c463d8e4f 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -8,20 +8,14 @@ from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE from experiments.pretraining_datasets.dclm import dclm_components_llama3 -from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc +from marin.datakit.download.nemotron_cc import nemotron_cc_step from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep # Raw dataset download step downloads = { - "nemotron_cc": ExecutorStep( - name="raw/nemotro-cc", - fn=download_nemotron_cc, - config=NemotronIngressConfig( - output_path=this_output_path(), - ), - ) + "nemotron_cc": nemotron_cc_step("raw/nemotro-cc").as_executor_step(), } _nemotron_cc_path = output_path_of(downloads["nemotron_cc"], "contrib/Nemotron/Nemotron-CC/data-jsonl/") diff --git a/lib/marin/src/marin/datakit/download/dclm_hq.py b/lib/marin/src/marin/datakit/download/dclm_hq.py index ab00c2037d..a4301245aa 100644 --- a/lib/marin/src/marin/datakit/download/dclm_hq.py +++ b/lib/marin/src/marin/datakit/download/dclm_hq.py @@ -34,12 +34,6 @@ logger = logging.getLogger(__name__) -@dataclass -class DCLMHQDownloadConfig: - input_path: str - output_path: str - - @dataclass class FileTask: """Represents a single file processing task.""" @@ -172,21 +166,8 @@ def process_file(task: FileTask) -> None: raise -def extract_dclm_hq_dump(input_path_or_cfg: str | DCLMHQDownloadConfig, output_path: str | None = None) -> None: - """Process the DCLM HQ dump and enrich with HTML from Common Crawl. - - Args: - input_path_or_cfg: Input directory path, or a DCLMHQDownloadConfig for backward compat. - output_path: Output directory path. Required when input_path_or_cfg is a string. 
- """ - if isinstance(input_path_or_cfg, DCLMHQDownloadConfig): - input_path = input_path_or_cfg.input_path - output_path = input_path_or_cfg.output_path - else: - input_path = input_path_or_cfg - if output_path is None: - raise ValueError("output_path is required when input_path_or_cfg is a string") - +def extract_dclm_hq_dump(input_path: str, output_path: str) -> None: + """Process the DCLM HQ dump and enrich with HTML from Common Crawl.""" logger.info(f"Starting processing of DCLM HQ dump in {input_path}") all_files = [] diff --git a/lib/marin/src/marin/datakit/download/filesystem.py b/lib/marin/src/marin/datakit/download/filesystem.py index 0177d644ef..7ace48ab38 100644 --- a/lib/marin/src/marin/datakit/download/filesystem.py +++ b/lib/marin/src/marin/datakit/download/filesystem.py @@ -5,7 +5,6 @@ import os import random import time -from dataclasses import dataclass from iris.marin_fs import url_to_fs from marin.execution.step_spec import StepSpec @@ -16,16 +15,6 @@ logger = logging.getLogger(__name__) -@dataclass -class TransferConfig: - """Kept for backward compatibility. Prefer ``transfer_files()`` with flat params.""" - - input_path: str - output_path: str - num_random_files: int | None = None - filetype: str = "jsonl.zst" - - def transfer_files( input_path: str, output_path: str, diff --git a/lib/marin/src/marin/datakit/download/nemotron_cc.py b/lib/marin/src/marin/datakit/download/nemotron_cc.py index ba06ba00f2..0e65f307b9 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_cc.py +++ b/lib/marin/src/marin/datakit/download/nemotron_cc.py @@ -14,12 +14,10 @@ import logging import os from collections.abc import Iterator -from dataclasses import dataclass import requests import zstandard from iris.marin_fs import open_url -from marin.execution import THIS_OUTPUT_PATH from marin.execution.step_spec import StepSpec from marin.utils import fsspec_exists from requests.adapters import HTTPAdapter @@ -85,22 +83,8 @@ def download_single_nemotron_path(input_file_path: str, output_file_path: str) - return {"input_file": input_file_path, "output_file": output_file_path, "num_records": num_records} -@dataclass -class NemotronIngressConfig: - """Kept for backward compatibility with ExecutorStep callers.""" - - output_path: str = THIS_OUTPUT_PATH - - -def download_nemotron_cc(output_path_or_cfg: str | NemotronIngressConfig) -> None: - """Download and process Nemotron-CC dataset from Common Crawl. - - Args: - output_path_or_cfg: Output directory path, or a NemotronIngressConfig for backward compat. - """ - output_path = ( - output_path_or_cfg.output_path if isinstance(output_path_or_cfg, NemotronIngressConfig) else output_path_or_cfg - ) +def download_nemotron_cc(output_path: str) -> None: + """Download and process Nemotron-CC dataset from Common Crawl.""" paths_file_path = os.path.join(output_path, "data-jsonl.paths") logger.info(f"Downloading Nemotron CC path file {paths_file_path}") diff --git a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py b/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py index a49caab9d7..a98513e7df 100644 --- a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py +++ b/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 # Backward-compat shim. 
Canonical location: marin.datakit.download.dclm_hq -from marin.datakit.download.dclm_hq import DCLMHQDownloadConfig as DCLMHQDownloadConfig from marin.datakit.download.dclm_hq import FileTask as FileTask from marin.datakit.download.dclm_hq import extract_dclm_hq_dump as extract_dclm_hq_dump from marin.datakit.download.dclm_hq import fetch_warc_from_cc as fetch_warc_from_cc diff --git a/lib/marin/src/marin/download/filesystem/transfer.py b/lib/marin/src/marin/download/filesystem/transfer.py index 5456bf8cc5..045a360623 100644 --- a/lib/marin/src/marin/download/filesystem/transfer.py +++ b/lib/marin/src/marin/download/filesystem/transfer.py @@ -2,5 +2,4 @@ # SPDX-License-Identifier: Apache-2.0 # Backward-compat shim. Canonical location: marin.datakit.download.filesystem -from marin.datakit.download.filesystem import TransferConfig as TransferConfig from marin.datakit.download.filesystem import transfer_files as transfer_files diff --git a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py b/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py index 81251cb66c..c7e8e16e54 100644 --- a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py +++ b/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py @@ -2,6 +2,5 @@ # SPDX-License-Identifier: Apache-2.0 # Backward-compat shim. Canonical location: marin.datakit.download.nemotron_cc -from marin.datakit.download.nemotron_cc import NemotronIngressConfig as NemotronIngressConfig from marin.datakit.download.nemotron_cc import download_nemotron_cc as download_nemotron_cc from marin.datakit.download.nemotron_cc import download_single_nemotron_path as download_single_nemotron_path diff --git a/tests/download/test_dclm_hq.py b/tests/download/test_dclm_hq.py index 4ca4f48aef..c83b5e03fe 100644 --- a/tests/download/test_dclm_hq.py +++ b/tests/download/test_dclm_hq.py @@ -7,7 +7,7 @@ from unittest.mock import patch import zstandard as zstd -from marin.datakit.download.dclm_hq import DCLMHQDownloadConfig, extract_dclm_hq_dump +from marin.datakit.download.dclm_hq import extract_dclm_hq_dump SAMPLE_DCLM_RECORDS = [ { @@ -171,8 +171,7 @@ def mock_requests_get(url, **kwargs): raise ValueError(f"Unexpected URL: {url}") with patch("marin.datakit.download.dclm_hq.requests.get", side_effect=mock_requests_get): - cfg = DCLMHQDownloadConfig(input_path=str(tmp_path / "input"), output_path=str(output_dir)) - extract_dclm_hq_dump(cfg) + extract_dclm_hq_dump(str(tmp_path / "input"), str(output_dir)) # Verify output files were created in nested structure shard1_output = output_dir / "shard1" diff --git a/tests/download/test_nemotron_cc.py b/tests/download/test_nemotron_cc.py index 4657d008eb..e4e89e361a 100644 --- a/tests/download/test_nemotron_cc.py +++ b/tests/download/test_nemotron_cc.py @@ -9,7 +9,7 @@ import pytest import zstandard as zstd from iris.marin_fs import open_url as _real_open_url -from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc +from marin.datakit.download.nemotron_cc import download_nemotron_cc _OPEN_URL_TARGET = "marin.datakit.download.nemotron_cc.open_url" _REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_cc.requests.Session" @@ -114,8 +114,7 @@ def test_download_nemotron_cc_pipeline(tmp_path, mock_paths_open): patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)), patch(_REQUESTS_SESSION_TARGET, _mock_session_for({"file1": file1_data, "file2": file2_data})), ): - cfg = NemotronIngressConfig(output_path=str(output_dir)) - download_nemotron_cc(cfg) + 
download_nemotron_cc(str(output_dir)) all_records = read_all_jsonl_zst(output_dir / "contrib" / "Nemotron") @@ -152,8 +151,7 @@ def test_download_nemotron_cc_dolma_format(tmp_path, mock_paths_open): patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)), patch(_REQUESTS_SESSION_TARGET, _mock_session_for({"test": compressed_data})), ): - cfg = NemotronIngressConfig(output_path=str(output_dir)) - download_nemotron_cc(cfg) + download_nemotron_cc(str(output_dir)) records = read_all_jsonl_zst(output_dir / "contrib" / "Nemotron") assert len(records) == 1 @@ -188,8 +186,7 @@ def test_download_nemotron_cc_skips_existing(tmp_path, mock_paths_open): patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)), patch(_REQUESTS_SESSION_TARGET) as mock_session, ): - cfg = NemotronIngressConfig(output_path=str(output_dir)) - download_nemotron_cc(cfg) + download_nemotron_cc(str(output_dir)) mock_session.return_value.get.assert_not_called() assert existing_output.read_text() == "existing" From 16f5c20abcc9d705220a3b3b9f1f393105a86773 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 12:13:23 -0700 Subject: [PATCH 12/56] Update datakit design doc: use Parquet instead of Vortex Switches the standard format from Vortex to Parquet throughout the design doc. Notes vortex#6905 as the blocking issue that motivated the change. Parquet provides the same columnar benefits with a mature ecosystem. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/2355_datakit.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/docs/design/2355_datakit.md b/docs/design/2355_datakit.md index 7ef15bf46e..0cb3ac8327 100644 --- a/docs/design/2355_datakit.md +++ b/docs/design/2355_datakit.md @@ -23,7 +23,7 @@ Download raw dataset from Hugging Face (or other sources). Raw downloads are pre Convert raw data into the **datakit standard format**: -* **File format**: Vortex \- columnar, supports pushdown filters and column projection, efficient lookup. +* **File format**: Parquet \- columnar, widely supported, supports pushdown filters and column projection. * **Mandatory columns**: * `id` \- unique document identifier (see [ID Column](#id-column) below) * `text` \- primary text content \- we enforce UTF-8 @@ -35,7 +35,7 @@ Convert raw data into the **datakit standard format**: * **Sort invariant**: each partition is sorted by `id` * **Typed output:** in the code the data has typed representation via `Artifact` -This is the "intake" step \- all downstream stages operate on normalized Vortex datasets. +This is the "intake" step \- all downstream stages operate on normalized Parquet datasets. ## 3\. Embed @@ -56,7 +56,7 @@ Join attributes datasets back to the source documents and apply filters: * Filter by classifier thresholds (e.g., quality score \> 0.8) * Remove duplicate spans/documents -Output is a clean, filtered Vortex dataset \- still sorted by `id`, still co-partitioned. +Output is a clean, filtered Parquet dataset \- still sorted by `id`, still co-partitioned. ## 8\. Tokenize @@ -66,15 +66,16 @@ Convert clean text into tokenized Levanter cache format. # Core Design Decisions -## Vortex as the Standard Format +## Parquet as the Standard Format -All intermediate datasets (from normalization through consolidation) use the Vortex columnar format. Benefits: +All intermediate datasets (from normalization through consolidation) use the Parquet columnar format. 
Benefits: * Column projection (only read the columns you need) * Filter pushdown * Efficient sorted merge joins via Zephyr +* Mature ecosystem with broad tooling support -NOTE: Vortex is much less mature than Parquet. This is a major concern. We will start with Vortex and if we hit roadblocks, revert to Parquet. +NOTE: We initially considered Vortex for its pushdown and lookup capabilities, but encountered blocking issues with Zephyr pipeline integration (see [vortex\#6905](https://github.com/vortex-data/vortex/issues/6905)). Parquet provides the same columnar benefits with a proven ecosystem. If Vortex matures, we can revisit. ## ID Column {#id-column} @@ -96,14 +97,14 @@ This is enforced by convention: each processing stage reads source partitions 1: ## Attributes Datasets {#attributes-datasets} -Processing stages (embed, classify, dedup) produce **attributes datasets** \- lightweight Vortex files containing: +Processing stages (embed, classify, dedup) produce **attributes datasets** \- lightweight Parquet files containing: * `id` — matching the source document ID * Stage-specific output columns (e.g., `quality_score`, `is_duplicate`, `topic_label`) Attributes datasets: -* Use Vortex format +* Use Parquet format * Are co-partitioned with the source (same shard count and key ranges) * Are sorted by `id` within each partition * Can be joined back to source documents via `sorted_merge_join` @@ -133,7 +134,7 @@ download = StepSpec( normalize = StepSpec( name="fineweb/normalize", deps=[download], - fn=lambda output_path: normalize_to_vortex( + fn=lambda output_path: normalize_to_parquet( input_path=download.output_path, output_path=output_path, text_field="text", ), hash_attrs={"text_field": "text"}, @@ -188,7 +189,7 @@ Core primitives — the reusable building blocks: ``` lib/marin/datakit/ - normalize # Raw format -> standard Vortex (id, text, ...) + normalize # Raw format -> standard Parquet (id, text, ...) embed # Document embedding classify # Quality/topic classification dedup # Deduplication (exact + fuzzy) @@ -201,7 +202,7 @@ Dataset-specific wiring \- which transforms to apply for a given dataset, expres # Execution Plan -* Implement `datakit/normalize.py` \- standard schema definitions, ID generation, raw format to Vortex conversion with mandatory columns +* Implement `datakit/normalize.py` \- standard schema definitions, ID generation, raw format to Parquet conversion with mandatory columns * Integration tests for the normalize step * Integration tests covering download, normalize, dedup and tokenize at reasonable scale * Update Grug/ferry experiment definitions to consume datakit pipeline outputs directly From 5af3272fcd8e61c2dc3177f5d2fb21ebdce6c28b Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:27:16 -0700 Subject: [PATCH 13/56] Remove global HfFileSystem() instance from stream_remove_columns Replace the module-level hf_fs = HfFileSystem() with per-call construction to avoid side effects at import time. Update the one external consumer (transform_dclm_hq.py) and test mock target. 
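As a minimal sketch of the pattern (the helper name below is illustrative, not part of the module):

```python
from huggingface_hub import HfFileSystem
import pyarrow.parquet as pq

# Before: hf_fs = HfFileSystem() lived at module scope and was shared by all callers,
# so merely importing the module constructed a filesystem client.
# After: build the handle inside the call that needs it; importing stays side-effect free.
def open_hf_parquet(input_file: str) -> pq.ParquetFile:
    return pq.ParquetFile(HfFileSystem().open(input_file))
```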
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/stream_remove_columns.py | 5 ++--- .../src/marin/download/huggingface/stream_remove_columns.py | 1 - lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py | 3 ++- tests/download/test_huggingface.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/stream_remove_columns.py b/lib/marin/src/marin/datakit/download/stream_remove_columns.py index b16e3a1f1b..ba883ee944 100644 --- a/lib/marin/src/marin/datakit/download/stream_remove_columns.py +++ b/lib/marin/src/marin/datakit/download/stream_remove_columns.py @@ -13,7 +13,6 @@ from tqdm import tqdm from zephyr import Dataset, ZephyrContext -hf_fs = HfFileSystem() logger = logging.getLogger(__name__) @@ -29,7 +28,7 @@ def prune_stream_and_save(input_file: str, output_file: str, keep_columns: list[ output_file (str): Path where pruned parquet file will be saved keep_columns (list[str]): List of column names to retain """ - parquet_file = pq.ParquetFile(hf_fs.open(input_file)) + parquet_file = pq.ParquetFile(HfFileSystem().open(input_file)) full_df_list = [] for batch in tqdm(parquet_file.iter_batches(batch_size=10000), desc=f"Processing {input_file}"): @@ -58,7 +57,7 @@ def get_file_tasks(hf_path: str, output_path: str, keep_columns: list[str]): Dict with input_file, output_file, and keep_columns for each parquet file """ logger.info(f"Loading dataset from {hf_path}") - parquet_list = hf_fs.glob(f"{hf_path}/*.parquet") + parquet_list = HfFileSystem().glob(f"{hf_path}/*.parquet") for file in parquet_list: output_file = os.path.join(output_path, os.path.basename(file)) diff --git a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py b/lib/marin/src/marin/download/huggingface/stream_remove_columns.py index 6d5d39f492..68a44db40c 100644 --- a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py +++ b/lib/marin/src/marin/download/huggingface/stream_remove_columns.py @@ -4,6 +4,5 @@ from marin.datakit.download.stream_remove_columns import DatasetConfig as DatasetConfig from marin.datakit.download.stream_remove_columns import get_file_tasks as get_file_tasks -from marin.datakit.download.stream_remove_columns import hf_fs as hf_fs from marin.datakit.download.stream_remove_columns import prune_hf_dataset as prune_hf_dataset from marin.datakit.download.stream_remove_columns import prune_stream_and_save as prune_stream_and_save diff --git a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py b/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py index dfaf263121..42f04264bf 100644 --- a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py +++ b/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py @@ -41,7 +41,7 @@ import draccus from iris.marin_fs import open_url, url_to_fs from marin.datakit.download.dclm_hq import find_html_in_cc -from marin.datakit.download.stream_remove_columns import hf_fs +from huggingface_hub import HfFileSystem from marin.schemas.web.convert import ExtractionConfig from marin.web.convert import convert_page from tqdm import tqdm @@ -115,6 +115,7 @@ def process_dclm_hq_dump(cfg: DCLMHQExtractionConfig) -> None: # Glob all files across all shards upfront all_files = [] + hf_fs = HfFileSystem() paths = [i.split("/")[-1] for i in hf_fs.ls(cfg.input_hf_path, detail=False)] paths = paths[: cfg.max_split] if cfg.max_split else paths diff --git a/tests/download/test_huggingface.py b/tests/download/test_huggingface.py index 24a5bc6169..4d16eadf6b 100644 
--- a/tests/download/test_huggingface.py +++ b/tests/download/test_huggingface.py @@ -189,7 +189,7 @@ def create_buffer(): mock_fs.glob = Mock(return_value=["hf://datasets/test-org/test-dataset@main/data/file.parquet"]) mock_fs.open = Mock(side_effect=lambda path, mode="rb": create_buffer()) - with patch("marin.datakit.download.stream_remove_columns.hf_fs", mock_fs): + with patch("marin.datakit.download.stream_remove_columns.HfFileSystem", return_value=mock_fs): prune_hf_dataset(cfg) # Verify output From f6959ecc1d1c72a2a25116bf2cddffdaafcbfabe Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:28:49 -0700 Subject: [PATCH 14/56] Inline pretraining downloads into simple.py, delete pretraining.py Removes the single-call wrapper functions in pretraining.py and inlines download_hf_step calls directly in simple.py via a _dl() helper. This eliminates the indirection of one function per dataset. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/simple.py | 68 +++++----- .../src/marin/datakit/download/pretraining.py | 119 ------------------ 2 files changed, 37 insertions(+), 150 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download/pretraining.py diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py index 79910f3741..5fa9a5fa65 100644 --- a/experiments/pretraining_datasets/simple.py +++ b/experiments/pretraining_datasets/simple.py @@ -12,19 +12,7 @@ from levanter.data.text import TextLmDatasetFormat from levanter.store.cache import CacheOptions -from marin.datakit.download.pretraining import ( - dclm_baseline_download, - dclm_baseline_wrong_download, - dolma3_mix_150b_1025_download, - fineweb_download, - fineweb_edu_download, - proofpile_2_download, - slimpajama_6b_download, - slimpajama_download, - starcoderdata_download, - the_pile_openwebtext2_download, - the_stack_dedup_download, -) +from marin.datakit.download.huggingface import download_hf_step from marin.execution.executor import ExecutorStep, InputName, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize @@ -65,42 +53,60 @@ def _tokenize_simple( return step +def _dl(name: str, hf_dataset_id: str, revision: str, output_path: str) -> ExecutorStep: + """Create a download ExecutorStep from a StepSpec.""" + return download_hf_step( + name, hf_dataset_id=hf_dataset_id, revision=revision, override_output_path=output_path + ).as_executor_step() + + # ============================================================================ # RAW DATASET DOWNLOADS # ============================================================================ def _build_downloads() -> dict[str, ExecutorStep | InputName]: - """Build the downloads dict from canonical StepSpec definitions in pretraining.py.""" - fineweb_edu_base = fineweb_edu_download().as_executor_step() + fineweb_edu_base = _dl("raw/fineweb-edu", "HuggingFaceFW/fineweb-edu", "87f0914", "raw/fineweb-edu-87f0914") return { - "fineweb": fineweb_download().as_executor_step(), + "fineweb": _dl("raw/fineweb", "HuggingFaceFW/fineweb", "cd85054", "raw/fineweb"), "fineweb_edu": fineweb_edu_base.cd("data"), "fineweb_edu_sample_10bt": fineweb_edu_base.cd("sample/10BT"), "fineweb_edu_sample_100bt": fineweb_edu_base.cd("sample/100BT"), "fineweb_edu_sample_350bt": fineweb_edu_base.cd("sample/350BT"), "slimpajama": ( - slimpajama_download() - .as_executor_step() - .cd("2d0accd/huggingface.co/datasets/cerebras/SlimPajama-627B/resolve/2d0accd") + _dl("raw/SlimPajama-627B", 
"cerebras/SlimPajama-627B", "2d0accd", "raw/SlimPajama-627B-262830").cd( + "2d0accd/huggingface.co/datasets/cerebras/SlimPajama-627B/resolve/2d0accd" + ) + ), + "slimpajama_6b": ( + _dl("raw/SlimPajama-6B", "DKYoon/SlimPajama-6B", "b5f90f4", "raw/SlimPajama-6B-be35b7").cd("data") + ), + "dolma3_mix_150b_1025": ( + _dl( + "raw/dolma3_mix-150B-1025", "allenai/dolma3_mix-150B-1025", "15d04ee", "raw/dolma3_mix-150B-1025-15d04ee" + ).cd("15d04ee") + ), + "dclm_baseline_wrong": _dl( + "raw/dclm-baseline-1.0", "mlfoundations/dclm-baseline-1.0", "a3b142c", "raw/dclm_WRONG_20250211/" + ), + "dclm_baseline": ( + _dl("raw/dclm-baseline-1.0", "mlfoundations/dclm-baseline-1.0", "a3b142c", "raw/dclm").cd("a3b142c") + ), + "the_stack_dedup": ( + _dl("raw/the-stack-dedup", "bigcode/the-stack-dedup", "17cad72", "raw/the-stack-dedup-4ba450").cd("17cad72") ), - "slimpajama_6b": slimpajama_6b_download().as_executor_step().cd("data"), - "dolma3_mix_150b_1025": dolma3_mix_150b_1025_download().as_executor_step().cd("15d04ee"), - "dclm_baseline_wrong": dclm_baseline_wrong_download().as_executor_step(), - "dclm_baseline": dclm_baseline_download().as_executor_step().cd("a3b142c"), - "the_stack_dedup": the_stack_dedup_download().as_executor_step().cd("17cad72"), "proofpile_2": ( - proofpile_2_download() - .as_executor_step() - .cd("901a927/huggingface.co/datasets/EleutherAI/proof-pile-2/resolve/901a927") + _dl("raw/proof-pile-2", "EleutherAI/proof-pile-2", "901a927", "raw/proof-pile-2-f1b1d8").cd( + "901a927/huggingface.co/datasets/EleutherAI/proof-pile-2/resolve/901a927" + ) ), "the_pile_openwebtext2": ( - the_pile_openwebtext2_download() - .as_executor_step() - .cd("1de27c6/huggingface.co/datasets/vietgpt/the_pile_openwebtext2/resolve/1de27c6") + _dl("raw/the_pile_openwebtext2", "vietgpt/the_pile_openwebtext2", "1de27c6", "raw/the_pile_openwebtext2").cd( + "1de27c6/huggingface.co/datasets/vietgpt/the_pile_openwebtext2/resolve/1de27c6" + ) ), - "starcoderdata": starcoderdata_download().as_executor_step(), + "starcoderdata": _dl("raw/starcoderdata", "bigcode/starcoderdata", "9fc30b5", "raw/starcoderdata-720c8c"), } diff --git a/lib/marin/src/marin/datakit/download/pretraining.py b/lib/marin/src/marin/datakit/download/pretraining.py deleted file mode 100644 index 3300820ba3..0000000000 --- a/lib/marin/src/marin/datakit/download/pretraining.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Pre-defined download steps for common pretraining datasets. - -Each function returns a StepSpec for downloading a specific dataset from -HuggingFace. These are the canonical definitions — experiments should -import from here rather than defining download steps inline. - -For datasets where the actual data lives in a subdirectory of the download -(e.g. fineweb-edu has data under ``data/``), the function returns the -StepSpec for the base download. Consumers that need the subdirectory path -should use ``step.output_path + "/data"`` or convert to ExecutorStep and -use ``.cd("data")``. -""" - -from marin.datakit.download.huggingface import download_hf_step -from marin.execution.step_spec import StepSpec - - -def fineweb_download() -> StepSpec: - return download_hf_step( - "raw/fineweb", - hf_dataset_id="HuggingFaceFW/fineweb", - revision="cd85054", - override_output_path="raw/fineweb", - ) - - -def fineweb_edu_download() -> StepSpec: - """Base download for fineweb-edu. 
Data is under the ``data/`` subdirectory.""" - return download_hf_step( - "raw/fineweb-edu", - hf_dataset_id="HuggingFaceFW/fineweb-edu", - revision="87f0914", - override_output_path="raw/fineweb-edu-87f0914", - ) - - -def slimpajama_download() -> StepSpec: - return download_hf_step( - "raw/SlimPajama-627B", - hf_dataset_id="cerebras/SlimPajama-627B", - revision="2d0accd", - override_output_path="raw/SlimPajama-627B-262830", - ) - - -def slimpajama_6b_download() -> StepSpec: - return download_hf_step( - "raw/SlimPajama-6B", - hf_dataset_id="DKYoon/SlimPajama-6B", - revision="b5f90f4", - override_output_path="raw/SlimPajama-6B-be35b7", - ) - - -def dolma3_mix_150b_1025_download() -> StepSpec: - return download_hf_step( - "raw/dolma3_mix-150B-1025", - hf_dataset_id="allenai/dolma3_mix-150B-1025", - revision="15d04ee", - override_output_path="raw/dolma3_mix-150B-1025-15d04ee", - ) - - -def dclm_baseline_download() -> StepSpec: - return download_hf_step( - "raw/dclm-baseline-1.0", - hf_dataset_id="mlfoundations/dclm-baseline-1.0", - revision="a3b142c", - override_output_path="raw/dclm", - ) - - -def the_stack_dedup_download() -> StepSpec: - return download_hf_step( - "raw/the-stack-dedup", - hf_dataset_id="bigcode/the-stack-dedup", - revision="17cad72", - override_output_path="raw/the-stack-dedup-4ba450", - ) - - -def proofpile_2_download() -> StepSpec: - return download_hf_step( - "raw/proof-pile-2", - hf_dataset_id="EleutherAI/proof-pile-2", - revision="901a927", - override_output_path="raw/proof-pile-2-f1b1d8", - ) - - -def the_pile_openwebtext2_download() -> StepSpec: - return download_hf_step( - "raw/the_pile_openwebtext2", - hf_dataset_id="vietgpt/the_pile_openwebtext2", - revision="1de27c6", - override_output_path="raw/the_pile_openwebtext2", - ) - - -def starcoderdata_download() -> StepSpec: - return download_hf_step( - "raw/starcoderdata", - hf_dataset_id="bigcode/starcoderdata", - revision="9fc30b5", - override_output_path="raw/starcoderdata-720c8c", - ) - - -def dclm_baseline_wrong_download() -> StepSpec: - """Legacy download with incorrect path. Kept for backward compat.""" - return download_hf_step( - "raw/dclm-baseline-1.0", - hf_dataset_id="mlfoundations/dclm-baseline-1.0", - revision="a3b142c", - override_output_path="raw/dclm_WRONG_20250211/", - ) From ddb203797004cae44c687baa331f8f07e74673ba Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:33:25 -0700 Subject: [PATCH 15/56] Delete old marin.download/ shim directory All imports have been migrated to marin.datakit.download.*. The shim re-export layer has zero consumers and is now removed. Data files (ar5iv JSON, stackexchange TSV) moved to datakit/download/data/. 
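Callers now import from the canonical package directly; one representative example (other modules follow the same pattern):

```python
# Old shim path (removed):
#   from marin.download.huggingface.download_hf import download_hf
# Canonical location:
from marin.datakit.download.huggingface import download_hf
```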
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../download/data}/ar5iv-v04-2024.json | 0 .../download/data}/stackexchange/README.md | 0 .../data}/stackexchange/stackexchange-urls.tsv | 0 lib/marin/src/marin/download/__init__.py | 7 ------- lib/marin/src/marin/download/ar5iv/__init__.py | 2 -- lib/marin/src/marin/download/ar5iv/download.py | 7 ------- lib/marin/src/marin/download/dclm_hq/__init__.py | 2 -- .../marin/download/dclm_hq/download_dclm_hq_html.py | 9 --------- lib/marin/src/marin/download/filesystem/__init__.py | 2 -- lib/marin/src/marin/download/filesystem/transfer.py | 5 ----- .../src/marin/download/huggingface/__init__.py | 2 -- .../src/marin/download/huggingface/download_hf.py | 13 ------------- .../download/huggingface/stream_remove_columns.py | 8 -------- .../marin/download/huggingface/upload_gcs_to_hf.py | 10 ---------- .../src/marin/download/nemotron_cc/__init__.py | 2 -- .../download/nemotron_cc/download_nemotron_cc.py | 6 ------ .../src/marin/download/uncheatable_eval/__init__.py | 2 -- .../src/marin/download/uncheatable_eval/download.py | 12 ------------ lib/marin/src/marin/download/wikipedia/__init__.py | 2 -- lib/marin/src/marin/download/wikipedia/download.py | 7 ------- 20 files changed, 98 deletions(-) rename lib/marin/src/marin/{download/ar5iv => datakit/download/data}/ar5iv-v04-2024.json (100%) rename lib/marin/src/marin/{download => datakit/download/data}/stackexchange/README.md (100%) rename lib/marin/src/marin/{download => datakit/download/data}/stackexchange/stackexchange-urls.tsv (100%) delete mode 100644 lib/marin/src/marin/download/__init__.py delete mode 100644 lib/marin/src/marin/download/ar5iv/__init__.py delete mode 100644 lib/marin/src/marin/download/ar5iv/download.py delete mode 100644 lib/marin/src/marin/download/dclm_hq/__init__.py delete mode 100644 lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py delete mode 100644 lib/marin/src/marin/download/filesystem/__init__.py delete mode 100644 lib/marin/src/marin/download/filesystem/transfer.py delete mode 100644 lib/marin/src/marin/download/huggingface/__init__.py delete mode 100644 lib/marin/src/marin/download/huggingface/download_hf.py delete mode 100644 lib/marin/src/marin/download/huggingface/stream_remove_columns.py delete mode 100644 lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py delete mode 100644 lib/marin/src/marin/download/nemotron_cc/__init__.py delete mode 100644 lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py delete mode 100644 lib/marin/src/marin/download/uncheatable_eval/__init__.py delete mode 100644 lib/marin/src/marin/download/uncheatable_eval/download.py delete mode 100644 lib/marin/src/marin/download/wikipedia/__init__.py delete mode 100644 lib/marin/src/marin/download/wikipedia/download.py diff --git a/lib/marin/src/marin/download/ar5iv/ar5iv-v04-2024.json b/lib/marin/src/marin/datakit/download/data/ar5iv-v04-2024.json similarity index 100% rename from lib/marin/src/marin/download/ar5iv/ar5iv-v04-2024.json rename to lib/marin/src/marin/datakit/download/data/ar5iv-v04-2024.json diff --git a/lib/marin/src/marin/download/stackexchange/README.md b/lib/marin/src/marin/datakit/download/data/stackexchange/README.md similarity index 100% rename from lib/marin/src/marin/download/stackexchange/README.md rename to lib/marin/src/marin/datakit/download/data/stackexchange/README.md diff --git a/lib/marin/src/marin/download/stackexchange/stackexchange-urls.tsv b/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv similarity 
index 100% rename from lib/marin/src/marin/download/stackexchange/stackexchange-urls.tsv rename to lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv diff --git a/lib/marin/src/marin/download/__init__.py b/lib/marin/src/marin/download/__init__.py deleted file mode 100644 index 26067cbf97..0000000000 --- a/lib/marin/src/marin/download/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download - -from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig -from marin.datakit.download.huggingface import download_hf -from marin.datakit.download.huggingface import download_hf as download_hf_ungated diff --git a/lib/marin/src/marin/download/ar5iv/__init__.py b/lib/marin/src/marin/download/ar5iv/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/ar5iv/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/ar5iv/download.py b/lib/marin/src/marin/download/ar5iv/download.py deleted file mode 100644 index 1a64dbf93e..0000000000 --- a/lib/marin/src/marin/download/ar5iv/download.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.ar5iv - -from marin.datakit.download.ar5iv import Ar5ivDownloadConfig as DownloadConfig # noqa: F401 - used by tests -from marin.datakit.download.ar5iv import download as download -from marin.datakit.download.ar5iv import process_shard as process_shard diff --git a/lib/marin/src/marin/download/dclm_hq/__init__.py b/lib/marin/src/marin/download/dclm_hq/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/dclm_hq/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py b/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py deleted file mode 100644 index a98513e7df..0000000000 --- a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. 
Canonical location: marin.datakit.download.dclm_hq - -from marin.datakit.download.dclm_hq import FileTask as FileTask -from marin.datakit.download.dclm_hq import extract_dclm_hq_dump as extract_dclm_hq_dump -from marin.datakit.download.dclm_hq import fetch_warc_from_cc as fetch_warc_from_cc -from marin.datakit.download.dclm_hq import find_html_in_cc as find_html_in_cc -from marin.datakit.download.dclm_hq import process_file as process_file diff --git a/lib/marin/src/marin/download/filesystem/__init__.py b/lib/marin/src/marin/download/filesystem/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/filesystem/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/filesystem/transfer.py b/lib/marin/src/marin/download/filesystem/transfer.py deleted file mode 100644 index 045a360623..0000000000 --- a/lib/marin/src/marin/download/filesystem/transfer.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.filesystem - -from marin.datakit.download.filesystem import transfer_files as transfer_files diff --git a/lib/marin/src/marin/download/huggingface/__init__.py b/lib/marin/src/marin/download/huggingface/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/huggingface/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/huggingface/download_hf.py b/lib/marin/src/marin/download/huggingface/download_hf.py deleted file mode 100644 index 2dd0177806..0000000000 --- a/lib/marin/src/marin/download/huggingface/download_hf.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.huggingface - -from marin.datakit.download.huggingface import DownloadConfig as DownloadConfig -from marin.datakit.download.huggingface import _relative_path_in_source as _relative_path_in_source -from marin.datakit.download.huggingface import download_hf as download_hf -from marin.datakit.download.huggingface import ensure_fsspec_path_writable as ensure_fsspec_path_writable -from marin.datakit.download.huggingface import main as main -from marin.datakit.download.huggingface import stream_file_to_fsspec as stream_file_to_fsspec - -if __name__ == "__main__": - main() diff --git a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py b/lib/marin/src/marin/download/huggingface/stream_remove_columns.py deleted file mode 100644 index 68a44db40c..0000000000 --- a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. 
Canonical location: marin.datakit.download.stream_remove_columns - -from marin.datakit.download.stream_remove_columns import DatasetConfig as DatasetConfig -from marin.datakit.download.stream_remove_columns import get_file_tasks as get_file_tasks -from marin.datakit.download.stream_remove_columns import prune_hf_dataset as prune_hf_dataset -from marin.datakit.download.stream_remove_columns import prune_stream_and_save as prune_stream_and_save diff --git a/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py b/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py deleted file mode 100644 index 43c368f5b9..0000000000 --- a/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.upload_gcs_to_hf - -from marin.datakit.download.upload_gcs_to_hf import UploadConfig as UploadConfig -from marin.datakit.download.upload_gcs_to_hf import main as main -from marin.datakit.download.upload_gcs_to_hf import upload_gcs_to_hf as upload_gcs_to_hf - -if __name__ == "__main__": - main() diff --git a/lib/marin/src/marin/download/nemotron_cc/__init__.py b/lib/marin/src/marin/download/nemotron_cc/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/nemotron_cc/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py b/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py deleted file mode 100644 index c7e8e16e54..0000000000 --- a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.nemotron_cc - -from marin.datakit.download.nemotron_cc import download_nemotron_cc as download_nemotron_cc -from marin.datakit.download.nemotron_cc import download_single_nemotron_path as download_single_nemotron_path diff --git a/lib/marin/src/marin/download/uncheatable_eval/__init__.py b/lib/marin/src/marin/download/uncheatable_eval/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/uncheatable_eval/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/uncheatable_eval/download.py b/lib/marin/src/marin/download/uncheatable_eval/download.py deleted file mode 100644 index 9baf9db8ad..0000000000 --- a/lib/marin/src/marin/download/uncheatable_eval/download.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. 
Canonical location: marin.datakit.download.uncheatable_eval - -from marin.datakit.download.uncheatable_eval import UncheatableEvalDataset as UncheatableEvalDataset -from marin.datakit.download.uncheatable_eval import ( - UncheatableEvalDownloadConfig as UncheatableEvalDownloadConfig, -) -from marin.datakit.download.uncheatable_eval import ( - download_latest_uncheatable_eval as download_latest_uncheatable_eval, -) -from marin.datakit.download.uncheatable_eval import make_uncheatable_eval_step as make_uncheatable_eval_step diff --git a/lib/marin/src/marin/download/wikipedia/__init__.py b/lib/marin/src/marin/download/wikipedia/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/wikipedia/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/wikipedia/download.py b/lib/marin/src/marin/download/wikipedia/download.py deleted file mode 100644 index 9b50143040..0000000000 --- a/lib/marin/src/marin/download/wikipedia/download.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.wikipedia - -from marin.datakit.download.wikipedia import download as download -from marin.datakit.download.wikipedia import download_tar as download_tar -from marin.datakit.download.wikipedia import process_file as process_file From f4f7cabf9a34880a519641fa1a20fdd506dc1581 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:36:43 -0700 Subject: [PATCH 16/56] Remove unused stackexchange data files The TSV and README were not referenced by any code. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../download/data/stackexchange/README.md | 20 -- .../data/stackexchange/stackexchange-urls.tsv | 183 ------------------ 2 files changed, 203 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download/data/stackexchange/README.md delete mode 100644 lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/README.md b/lib/marin/src/marin/datakit/download/data/stackexchange/README.md deleted file mode 100644 index 295232a502..0000000000 --- a/lib/marin/src/marin/datakit/download/data/stackexchange/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Downloading Stackexchange Data - -Raw StackExchange dumps are available at https://archive.org/download/stackexchange. We use the dump from 2024-04-02. -We exclude "meta" sites and only use the main sites (i.e., we use "3dprinting.stackexchange.com.7z" but don't use -"3dprinting.meta.stackexchange.com.7z"). The full dump is approximately 100 GB. - -**Downloading Data to GCS**: To get the raw data, we use the GCS Storage Transfer Service to perform the data transfer. 
-To kick off the job, create `stackexchange-urls.tsv` using the following instructions (per @dlwh): - -- Go to `[https://archive.org/details/stackexchange](https://archive.org/details/stackexchange)` -- Expand the `7z` sidebar, copy all the names (w/ mouse) -- Paste into a text editor (i.e., VSCode) -- Run (sequence of find/replace commands - regex mode) - + Remove all " download" strings -- match on `download ` - + Remove all file sizes (e.g., 188M) -- match on `^\d.*?\d[KMG]` - + Remove all `meta` sites -- match on `.*\.meta\..*\n` - + Prepend URL Prefix `https://archive.org/download/stackexchange/` to each line - + Insert `TsvHttpData-1.0` on the first line - -Pass this file to the Storage Transfer Job CLI to kick off the transfer. diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv b/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv deleted file mode 100644 index 763e0341da..0000000000 --- a/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv +++ /dev/null @@ -1,183 +0,0 @@ -TsvHttpData-1.0 -https://archive.org/download/stackexchange/3dprinting.stackexchange.com.7z -https://archive.org/download/stackexchange/academia.stackexchange.com.7z -https://archive.org/download/stackexchange/ai.stackexchange.com.7z -https://archive.org/download/stackexchange/android.stackexchange.com.7z -https://archive.org/download/stackexchange/anime.stackexchange.com.7z -https://archive.org/download/stackexchange/apple.stackexchange.com.7z -https://archive.org/download/stackexchange/arduino.stackexchange.com.7z -https://archive.org/download/stackexchange/askubuntu.com.7z -https://archive.org/download/stackexchange/astronomy.stackexchange.com.7z -https://archive.org/download/stackexchange/aviation.stackexchange.com.7z -https://archive.org/download/stackexchange/avp.stackexchange.com.7z -https://archive.org/download/stackexchange/beer.stackexchange.com.7z -https://archive.org/download/stackexchange/bicycles.stackexchange.com.7z -https://archive.org/download/stackexchange/bioacoustics.stackexchange.com.7z -https://archive.org/download/stackexchange/bioinformatics.stackexchange.com.7z -https://archive.org/download/stackexchange/biology.stackexchange.com.7z -https://archive.org/download/stackexchange/bitcoin.stackexchange.com.7z -https://archive.org/download/stackexchange/blender.stackexchange.com.7z -https://archive.org/download/stackexchange/boardgames.stackexchange.com.7z -https://archive.org/download/stackexchange/bricks.stackexchange.com.7z -https://archive.org/download/stackexchange/buddhism.stackexchange.com.7z -https://archive.org/download/stackexchange/cardano.stackexchange.com.7z -https://archive.org/download/stackexchange/chemistry.stackexchange.com.7z -https://archive.org/download/stackexchange/chess.stackexchange.com.7z -https://archive.org/download/stackexchange/chinese.stackexchange.com.7z -https://archive.org/download/stackexchange/christianity.stackexchange.com.7z -https://archive.org/download/stackexchange/civicrm.stackexchange.com.7z -https://archive.org/download/stackexchange/codegolf.stackexchange.com.7z -https://archive.org/download/stackexchange/codereview.stackexchange.com.7z -https://archive.org/download/stackexchange/coffee.stackexchange.com.7z -https://archive.org/download/stackexchange/cogsci.stackexchange.com.7z -https://archive.org/download/stackexchange/computergraphics.stackexchange.com.7z -https://archive.org/download/stackexchange/conlang.stackexchange.com.7z 
-https://archive.org/download/stackexchange/cooking.stackexchange.com.7z -https://archive.org/download/stackexchange/craftcms.stackexchange.com.7z -https://archive.org/download/stackexchange/crafts.stackexchange.com.7z -https://archive.org/download/stackexchange/crypto.stackexchange.com.7z -https://archive.org/download/stackexchange/cs.stackexchange.com.7z -https://archive.org/download/stackexchange/cseducators.stackexchange.com.7z -https://archive.org/download/stackexchange/cstheory.stackexchange.com.7z -https://archive.org/download/stackexchange/datascience.stackexchange.com.7z -https://archive.org/download/stackexchange/dba.stackexchange.com.7z -https://archive.org/download/stackexchange/devops.stackexchange.com.7z -https://archive.org/download/stackexchange/diy.stackexchange.com.7z -https://archive.org/download/stackexchange/drones.stackexchange.com.7z -https://archive.org/download/stackexchange/drupal.stackexchange.com.7z -https://archive.org/download/stackexchange/dsp.stackexchange.com.7z -https://archive.org/download/stackexchange/earthscience.stackexchange.com.7z -https://archive.org/download/stackexchange/ebooks.stackexchange.com.7z -https://archive.org/download/stackexchange/economics.stackexchange.com.7z -https://archive.org/download/stackexchange/electronics.stackexchange.com.7z -https://archive.org/download/stackexchange/elementaryos.stackexchange.com.7z -https://archive.org/download/stackexchange/ell.stackexchange.com.7z -https://archive.org/download/stackexchange/emacs.stackexchange.com.7z -https://archive.org/download/stackexchange/engineering.stackexchange.com.7z -https://archive.org/download/stackexchange/english.stackexchange.com.7z -https://archive.org/download/stackexchange/eosio.stackexchange.com.7z -https://archive.org/download/stackexchange/es.stackoverflow.com.7z -https://archive.org/download/stackexchange/esperanto.stackexchange.com.7z -https://archive.org/download/stackexchange/ethereum.stackexchange.com.7z -https://archive.org/download/stackexchange/expatriates.stackexchange.com.7z -https://archive.org/download/stackexchange/expressionengine.stackexchange.com.7z -https://archive.org/download/stackexchange/fitness.stackexchange.com.7z -https://archive.org/download/stackexchange/freelancing.stackexchange.com.7z -https://archive.org/download/stackexchange/french.stackexchange.com.7z -https://archive.org/download/stackexchange/gamedev.stackexchange.com.7z -https://archive.org/download/stackexchange/gaming.stackexchange.com.7z -https://archive.org/download/stackexchange/gardening.stackexchange.com.7z -https://archive.org/download/stackexchange/genai.stackexchange.com.7z -https://archive.org/download/stackexchange/genealogy.stackexchange.com.7z -https://archive.org/download/stackexchange/german.stackexchange.com.7z -https://archive.org/download/stackexchange/gis.stackexchange.com.7z -https://archive.org/download/stackexchange/graphicdesign.stackexchange.com.7z -https://archive.org/download/stackexchange/ham.stackexchange.com.7z -https://archive.org/download/stackexchange/hardwarerecs.stackexchange.com.7z -https://archive.org/download/stackexchange/health.stackexchange.com.7z -https://archive.org/download/stackexchange/hermeneutics.stackexchange.com.7z -https://archive.org/download/stackexchange/hinduism.stackexchange.com.7z -https://archive.org/download/stackexchange/history.stackexchange.com.7z -https://archive.org/download/stackexchange/homebrew.stackexchange.com.7z -https://archive.org/download/stackexchange/hsm.stackexchange.com.7z 
-https://archive.org/download/stackexchange/interpersonal.stackexchange.com.7z -https://archive.org/download/stackexchange/iot.stackexchange.com.7z -https://archive.org/download/stackexchange/iota.stackexchange.com.7z -https://archive.org/download/stackexchange/islam.stackexchange.com.7z -https://archive.org/download/stackexchange/italian.stackexchange.com.7z -https://archive.org/download/stackexchange/ja.stackoverflow.com.7z -https://archive.org/download/stackexchange/japanese.stackexchange.com.7z -https://archive.org/download/stackexchange/joomla.stackexchange.com.7z -https://archive.org/download/stackexchange/judaism.stackexchange.com.7z -https://archive.org/download/stackexchange/korean.stackexchange.com.7z -https://archive.org/download/stackexchange/langdev.stackexchange.com.7z -https://archive.org/download/stackexchange/languagelearning.stackexchange.com.7z -https://archive.org/download/stackexchange/latin.stackexchange.com.7z -https://archive.org/download/stackexchange/law.stackexchange.com.7z -https://archive.org/download/stackexchange/lifehacks.stackexchange.com.7z -https://archive.org/download/stackexchange/linguistics.stackexchange.com.7z -https://archive.org/download/stackexchange/literature.stackexchange.com.7z -https://archive.org/download/stackexchange/magento.stackexchange.com.7z -https://archive.org/download/stackexchange/martialarts.stackexchange.com.7z -https://archive.org/download/stackexchange/materials.stackexchange.com.7z -https://archive.org/download/stackexchange/math.stackexchange.com.7z -https://archive.org/download/stackexchange/matheducators.stackexchange.com.7z -https://archive.org/download/stackexchange/mathematica.stackexchange.com.7z -https://archive.org/download/stackexchange/mathoverflow.net.7z -https://archive.org/download/stackexchange/mechanics.stackexchange.com.7z -https://archive.org/download/stackexchange/moderators.stackexchange.com.7z -https://archive.org/download/stackexchange/monero.stackexchange.com.7z -https://archive.org/download/stackexchange/money.stackexchange.com.7z -https://archive.org/download/stackexchange/movies.stackexchange.com.7z -https://archive.org/download/stackexchange/music.stackexchange.com.7z -https://archive.org/download/stackexchange/musicfans.stackexchange.com.7z -https://archive.org/download/stackexchange/mythology.stackexchange.com.7z -https://archive.org/download/stackexchange/networkengineering.stackexchange.com.7z -https://archive.org/download/stackexchange/opendata.stackexchange.com.7z -https://archive.org/download/stackexchange/opensource.stackexchange.com.7z -https://archive.org/download/stackexchange/or.stackexchange.com.7z -https://archive.org/download/stackexchange/outdoors.stackexchange.com.7z -https://archive.org/download/stackexchange/parenting.stackexchange.com.7z -https://archive.org/download/stackexchange/patents.stackexchange.com.7z -https://archive.org/download/stackexchange/pets.stackexchange.com.7z -https://archive.org/download/stackexchange/philosophy.stackexchange.com.7z -https://archive.org/download/stackexchange/photo.stackexchange.com.7z -https://archive.org/download/stackexchange/physics.stackexchange.com.7z -https://archive.org/download/stackexchange/pm.stackexchange.com.7z -https://archive.org/download/stackexchange/poker.stackexchange.com.7z -https://archive.org/download/stackexchange/politics.stackexchange.com.7z -https://archive.org/download/stackexchange/portuguese.stackexchange.com.7z -https://archive.org/download/stackexchange/proofassistants.stackexchange.com.7z 
-https://archive.org/download/stackexchange/pt.stackoverflow.com.7z -https://archive.org/download/stackexchange/puzzling.stackexchange.com.7z -https://archive.org/download/stackexchange/quant.stackexchange.com.7z -https://archive.org/download/stackexchange/quantumcomputing.stackexchange.com.7z -https://archive.org/download/stackexchange/raspberrypi.stackexchange.com.7z -https://archive.org/download/stackexchange/retrocomputing.stackexchange.com.7z -https://archive.org/download/stackexchange/reverseengineering.stackexchange.com.7z -https://archive.org/download/stackexchange/robotics.stackexchange.com.7z -https://archive.org/download/stackexchange/rpg.stackexchange.com.7z -https://archive.org/download/stackexchange/ru.stackoverflow.com.7z -https://archive.org/download/stackexchange/rus.stackexchange.com.7z -https://archive.org/download/stackexchange/russian.stackexchange.com.7z -https://archive.org/download/stackexchange/salesforce.stackexchange.com.7z -https://archive.org/download/stackexchange/scicomp.stackexchange.com.7z -https://archive.org/download/stackexchange/scifi.stackexchange.com.7z -https://archive.org/download/stackexchange/security.stackexchange.com.7z -https://archive.org/download/stackexchange/serverfault.com.7z -https://archive.org/download/stackexchange/sharepoint.stackexchange.com.7z -https://archive.org/download/stackexchange/sitecore.stackexchange.com.7z -https://archive.org/download/stackexchange/skeptics.stackexchange.com.7z -https://archive.org/download/stackexchange/softwareengineering.stackexchange.com.7z -https://archive.org/download/stackexchange/softwarerecs.stackexchange.com.7z -https://archive.org/download/stackexchange/solana.stackexchange.com.7z -https://archive.org/download/stackexchange/sound.stackexchange.com.7z -https://archive.org/download/stackexchange/space.stackexchange.com.7z -https://archive.org/download/stackexchange/spanish.stackexchange.com.7z -https://archive.org/download/stackexchange/sports.stackexchange.com.7z -https://archive.org/download/stackexchange/sqa.stackexchange.com.7z -https://archive.org/download/stackexchange/stackapps.com.7z -https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z -https://archive.org/download/stackexchange/stats.stackexchange.com.7z -https://archive.org/download/stackexchange/stellar.stackexchange.com.7z -https://archive.org/download/stackexchange/substrate.stackexchange.com.7z -https://archive.org/download/stackexchange/superuser.com.7z -https://archive.org/download/stackexchange/sustainability.stackexchange.com.7z -https://archive.org/download/stackexchange/tex.stackexchange.com.7z -https://archive.org/download/stackexchange/tezos.stackexchange.com.7z -https://archive.org/download/stackexchange/tor.stackexchange.com.7z -https://archive.org/download/stackexchange/travel.stackexchange.com.7z -https://archive.org/download/stackexchange/tridion.stackexchange.com.7z -https://archive.org/download/stackexchange/ukrainian.stackexchange.com.7z -https://archive.org/download/stackexchange/unix.stackexchange.com.7z -https://archive.org/download/stackexchange/ux.stackexchange.com.7z -https://archive.org/download/stackexchange/vegetarianism.stackexchange.com.7z -https://archive.org/download/stackexchange/vi.stackexchange.com.7z -https://archive.org/download/stackexchange/webapps.stackexchange.com.7z -https://archive.org/download/stackexchange/webmasters.stackexchange.com.7z -https://archive.org/download/stackexchange/windowsphone.stackexchange.com.7z 
-https://archive.org/download/stackexchange/woodworking.stackexchange.com.7z -https://archive.org/download/stackexchange/wordpress.stackexchange.com.7z -https://archive.org/download/stackexchange/workplace.stackexchange.com.7z -https://archive.org/download/stackexchange/worldbuilding.stackexchange.com.7z -https://archive.org/download/stackexchange/writers.stackexchange.com.7z From 64fd456507e6f29c015323247a71007b5f41aad9 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:37:54 -0700 Subject: [PATCH 17/56] Revert "Remove unused stackexchange data files" This reverts commit ced1f4f307809d14f4e50444c943cf865d3e39fa. --- .../download/data/stackexchange/README.md | 20 ++ .../data/stackexchange/stackexchange-urls.tsv | 183 ++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 lib/marin/src/marin/datakit/download/data/stackexchange/README.md create mode 100644 lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/README.md b/lib/marin/src/marin/datakit/download/data/stackexchange/README.md new file mode 100644 index 0000000000..295232a502 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/data/stackexchange/README.md @@ -0,0 +1,20 @@ +# Downloading Stackexchange Data + +Raw StackExchange dumps are available at https://archive.org/download/stackexchange. We use the dump from 2024-04-02. +We exclude "meta" sites and only use the main sites (i.e., we use "3dprinting.stackexchange.com.7z" but don't use +"3dprinting.meta.stackexchange.com.7z"). The full dump is approximately 100 GB. + +**Downloading Data to GCS**: To get the raw data, we use the GCS Storage Transfer Service to perform the data transfer. +To kick off the job, create `stackexchange-urls.tsv` using the following instructions (per @dlwh): + +- Go to `[https://archive.org/details/stackexchange](https://archive.org/details/stackexchange)` +- Expand the `7z` sidebar, copy all the names (w/ mouse) +- Paste into a text editor (i.e., VSCode) +- Run (sequence of find/replace commands - regex mode) + + Remove all " download" strings -- match on `download ` + + Remove all file sizes (e.g., 188M) -- match on `^\d.*?\d[KMG]` + + Remove all `meta` sites -- match on `.*\.meta\..*\n` + + Prepend URL Prefix `https://archive.org/download/stackexchange/` to each line + + Insert `TsvHttpData-1.0` on the first line + +Pass this file to the Storage Transfer Job CLI to kick off the transfer. 
diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv b/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv new file mode 100644 index 0000000000..763e0341da --- /dev/null +++ b/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv @@ -0,0 +1,183 @@ +TsvHttpData-1.0 +https://archive.org/download/stackexchange/3dprinting.stackexchange.com.7z +https://archive.org/download/stackexchange/academia.stackexchange.com.7z +https://archive.org/download/stackexchange/ai.stackexchange.com.7z +https://archive.org/download/stackexchange/android.stackexchange.com.7z +https://archive.org/download/stackexchange/anime.stackexchange.com.7z +https://archive.org/download/stackexchange/apple.stackexchange.com.7z +https://archive.org/download/stackexchange/arduino.stackexchange.com.7z +https://archive.org/download/stackexchange/askubuntu.com.7z +https://archive.org/download/stackexchange/astronomy.stackexchange.com.7z +https://archive.org/download/stackexchange/aviation.stackexchange.com.7z +https://archive.org/download/stackexchange/avp.stackexchange.com.7z +https://archive.org/download/stackexchange/beer.stackexchange.com.7z +https://archive.org/download/stackexchange/bicycles.stackexchange.com.7z +https://archive.org/download/stackexchange/bioacoustics.stackexchange.com.7z +https://archive.org/download/stackexchange/bioinformatics.stackexchange.com.7z +https://archive.org/download/stackexchange/biology.stackexchange.com.7z +https://archive.org/download/stackexchange/bitcoin.stackexchange.com.7z +https://archive.org/download/stackexchange/blender.stackexchange.com.7z +https://archive.org/download/stackexchange/boardgames.stackexchange.com.7z +https://archive.org/download/stackexchange/bricks.stackexchange.com.7z +https://archive.org/download/stackexchange/buddhism.stackexchange.com.7z +https://archive.org/download/stackexchange/cardano.stackexchange.com.7z +https://archive.org/download/stackexchange/chemistry.stackexchange.com.7z +https://archive.org/download/stackexchange/chess.stackexchange.com.7z +https://archive.org/download/stackexchange/chinese.stackexchange.com.7z +https://archive.org/download/stackexchange/christianity.stackexchange.com.7z +https://archive.org/download/stackexchange/civicrm.stackexchange.com.7z +https://archive.org/download/stackexchange/codegolf.stackexchange.com.7z +https://archive.org/download/stackexchange/codereview.stackexchange.com.7z +https://archive.org/download/stackexchange/coffee.stackexchange.com.7z +https://archive.org/download/stackexchange/cogsci.stackexchange.com.7z +https://archive.org/download/stackexchange/computergraphics.stackexchange.com.7z +https://archive.org/download/stackexchange/conlang.stackexchange.com.7z +https://archive.org/download/stackexchange/cooking.stackexchange.com.7z +https://archive.org/download/stackexchange/craftcms.stackexchange.com.7z +https://archive.org/download/stackexchange/crafts.stackexchange.com.7z +https://archive.org/download/stackexchange/crypto.stackexchange.com.7z +https://archive.org/download/stackexchange/cs.stackexchange.com.7z +https://archive.org/download/stackexchange/cseducators.stackexchange.com.7z +https://archive.org/download/stackexchange/cstheory.stackexchange.com.7z +https://archive.org/download/stackexchange/datascience.stackexchange.com.7z +https://archive.org/download/stackexchange/dba.stackexchange.com.7z +https://archive.org/download/stackexchange/devops.stackexchange.com.7z 
+https://archive.org/download/stackexchange/diy.stackexchange.com.7z +https://archive.org/download/stackexchange/drones.stackexchange.com.7z +https://archive.org/download/stackexchange/drupal.stackexchange.com.7z +https://archive.org/download/stackexchange/dsp.stackexchange.com.7z +https://archive.org/download/stackexchange/earthscience.stackexchange.com.7z +https://archive.org/download/stackexchange/ebooks.stackexchange.com.7z +https://archive.org/download/stackexchange/economics.stackexchange.com.7z +https://archive.org/download/stackexchange/electronics.stackexchange.com.7z +https://archive.org/download/stackexchange/elementaryos.stackexchange.com.7z +https://archive.org/download/stackexchange/ell.stackexchange.com.7z +https://archive.org/download/stackexchange/emacs.stackexchange.com.7z +https://archive.org/download/stackexchange/engineering.stackexchange.com.7z +https://archive.org/download/stackexchange/english.stackexchange.com.7z +https://archive.org/download/stackexchange/eosio.stackexchange.com.7z +https://archive.org/download/stackexchange/es.stackoverflow.com.7z +https://archive.org/download/stackexchange/esperanto.stackexchange.com.7z +https://archive.org/download/stackexchange/ethereum.stackexchange.com.7z +https://archive.org/download/stackexchange/expatriates.stackexchange.com.7z +https://archive.org/download/stackexchange/expressionengine.stackexchange.com.7z +https://archive.org/download/stackexchange/fitness.stackexchange.com.7z +https://archive.org/download/stackexchange/freelancing.stackexchange.com.7z +https://archive.org/download/stackexchange/french.stackexchange.com.7z +https://archive.org/download/stackexchange/gamedev.stackexchange.com.7z +https://archive.org/download/stackexchange/gaming.stackexchange.com.7z +https://archive.org/download/stackexchange/gardening.stackexchange.com.7z +https://archive.org/download/stackexchange/genai.stackexchange.com.7z +https://archive.org/download/stackexchange/genealogy.stackexchange.com.7z +https://archive.org/download/stackexchange/german.stackexchange.com.7z +https://archive.org/download/stackexchange/gis.stackexchange.com.7z +https://archive.org/download/stackexchange/graphicdesign.stackexchange.com.7z +https://archive.org/download/stackexchange/ham.stackexchange.com.7z +https://archive.org/download/stackexchange/hardwarerecs.stackexchange.com.7z +https://archive.org/download/stackexchange/health.stackexchange.com.7z +https://archive.org/download/stackexchange/hermeneutics.stackexchange.com.7z +https://archive.org/download/stackexchange/hinduism.stackexchange.com.7z +https://archive.org/download/stackexchange/history.stackexchange.com.7z +https://archive.org/download/stackexchange/homebrew.stackexchange.com.7z +https://archive.org/download/stackexchange/hsm.stackexchange.com.7z +https://archive.org/download/stackexchange/interpersonal.stackexchange.com.7z +https://archive.org/download/stackexchange/iot.stackexchange.com.7z +https://archive.org/download/stackexchange/iota.stackexchange.com.7z +https://archive.org/download/stackexchange/islam.stackexchange.com.7z +https://archive.org/download/stackexchange/italian.stackexchange.com.7z +https://archive.org/download/stackexchange/ja.stackoverflow.com.7z +https://archive.org/download/stackexchange/japanese.stackexchange.com.7z +https://archive.org/download/stackexchange/joomla.stackexchange.com.7z +https://archive.org/download/stackexchange/judaism.stackexchange.com.7z +https://archive.org/download/stackexchange/korean.stackexchange.com.7z 
+https://archive.org/download/stackexchange/langdev.stackexchange.com.7z +https://archive.org/download/stackexchange/languagelearning.stackexchange.com.7z +https://archive.org/download/stackexchange/latin.stackexchange.com.7z +https://archive.org/download/stackexchange/law.stackexchange.com.7z +https://archive.org/download/stackexchange/lifehacks.stackexchange.com.7z +https://archive.org/download/stackexchange/linguistics.stackexchange.com.7z +https://archive.org/download/stackexchange/literature.stackexchange.com.7z +https://archive.org/download/stackexchange/magento.stackexchange.com.7z +https://archive.org/download/stackexchange/martialarts.stackexchange.com.7z +https://archive.org/download/stackexchange/materials.stackexchange.com.7z +https://archive.org/download/stackexchange/math.stackexchange.com.7z +https://archive.org/download/stackexchange/matheducators.stackexchange.com.7z +https://archive.org/download/stackexchange/mathematica.stackexchange.com.7z +https://archive.org/download/stackexchange/mathoverflow.net.7z +https://archive.org/download/stackexchange/mechanics.stackexchange.com.7z +https://archive.org/download/stackexchange/moderators.stackexchange.com.7z +https://archive.org/download/stackexchange/monero.stackexchange.com.7z +https://archive.org/download/stackexchange/money.stackexchange.com.7z +https://archive.org/download/stackexchange/movies.stackexchange.com.7z +https://archive.org/download/stackexchange/music.stackexchange.com.7z +https://archive.org/download/stackexchange/musicfans.stackexchange.com.7z +https://archive.org/download/stackexchange/mythology.stackexchange.com.7z +https://archive.org/download/stackexchange/networkengineering.stackexchange.com.7z +https://archive.org/download/stackexchange/opendata.stackexchange.com.7z +https://archive.org/download/stackexchange/opensource.stackexchange.com.7z +https://archive.org/download/stackexchange/or.stackexchange.com.7z +https://archive.org/download/stackexchange/outdoors.stackexchange.com.7z +https://archive.org/download/stackexchange/parenting.stackexchange.com.7z +https://archive.org/download/stackexchange/patents.stackexchange.com.7z +https://archive.org/download/stackexchange/pets.stackexchange.com.7z +https://archive.org/download/stackexchange/philosophy.stackexchange.com.7z +https://archive.org/download/stackexchange/photo.stackexchange.com.7z +https://archive.org/download/stackexchange/physics.stackexchange.com.7z +https://archive.org/download/stackexchange/pm.stackexchange.com.7z +https://archive.org/download/stackexchange/poker.stackexchange.com.7z +https://archive.org/download/stackexchange/politics.stackexchange.com.7z +https://archive.org/download/stackexchange/portuguese.stackexchange.com.7z +https://archive.org/download/stackexchange/proofassistants.stackexchange.com.7z +https://archive.org/download/stackexchange/pt.stackoverflow.com.7z +https://archive.org/download/stackexchange/puzzling.stackexchange.com.7z +https://archive.org/download/stackexchange/quant.stackexchange.com.7z +https://archive.org/download/stackexchange/quantumcomputing.stackexchange.com.7z +https://archive.org/download/stackexchange/raspberrypi.stackexchange.com.7z +https://archive.org/download/stackexchange/retrocomputing.stackexchange.com.7z +https://archive.org/download/stackexchange/reverseengineering.stackexchange.com.7z +https://archive.org/download/stackexchange/robotics.stackexchange.com.7z +https://archive.org/download/stackexchange/rpg.stackexchange.com.7z 
+https://archive.org/download/stackexchange/ru.stackoverflow.com.7z +https://archive.org/download/stackexchange/rus.stackexchange.com.7z +https://archive.org/download/stackexchange/russian.stackexchange.com.7z +https://archive.org/download/stackexchange/salesforce.stackexchange.com.7z +https://archive.org/download/stackexchange/scicomp.stackexchange.com.7z +https://archive.org/download/stackexchange/scifi.stackexchange.com.7z +https://archive.org/download/stackexchange/security.stackexchange.com.7z +https://archive.org/download/stackexchange/serverfault.com.7z +https://archive.org/download/stackexchange/sharepoint.stackexchange.com.7z +https://archive.org/download/stackexchange/sitecore.stackexchange.com.7z +https://archive.org/download/stackexchange/skeptics.stackexchange.com.7z +https://archive.org/download/stackexchange/softwareengineering.stackexchange.com.7z +https://archive.org/download/stackexchange/softwarerecs.stackexchange.com.7z +https://archive.org/download/stackexchange/solana.stackexchange.com.7z +https://archive.org/download/stackexchange/sound.stackexchange.com.7z +https://archive.org/download/stackexchange/space.stackexchange.com.7z +https://archive.org/download/stackexchange/spanish.stackexchange.com.7z +https://archive.org/download/stackexchange/sports.stackexchange.com.7z +https://archive.org/download/stackexchange/sqa.stackexchange.com.7z +https://archive.org/download/stackexchange/stackapps.com.7z +https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z +https://archive.org/download/stackexchange/stats.stackexchange.com.7z +https://archive.org/download/stackexchange/stellar.stackexchange.com.7z +https://archive.org/download/stackexchange/substrate.stackexchange.com.7z +https://archive.org/download/stackexchange/superuser.com.7z +https://archive.org/download/stackexchange/sustainability.stackexchange.com.7z +https://archive.org/download/stackexchange/tex.stackexchange.com.7z +https://archive.org/download/stackexchange/tezos.stackexchange.com.7z +https://archive.org/download/stackexchange/tor.stackexchange.com.7z +https://archive.org/download/stackexchange/travel.stackexchange.com.7z +https://archive.org/download/stackexchange/tridion.stackexchange.com.7z +https://archive.org/download/stackexchange/ukrainian.stackexchange.com.7z +https://archive.org/download/stackexchange/unix.stackexchange.com.7z +https://archive.org/download/stackexchange/ux.stackexchange.com.7z +https://archive.org/download/stackexchange/vegetarianism.stackexchange.com.7z +https://archive.org/download/stackexchange/vi.stackexchange.com.7z +https://archive.org/download/stackexchange/webapps.stackexchange.com.7z +https://archive.org/download/stackexchange/webmasters.stackexchange.com.7z +https://archive.org/download/stackexchange/windowsphone.stackexchange.com.7z +https://archive.org/download/stackexchange/woodworking.stackexchange.com.7z +https://archive.org/download/stackexchange/wordpress.stackexchange.com.7z +https://archive.org/download/stackexchange/workplace.stackexchange.com.7z +https://archive.org/download/stackexchange/worldbuilding.stackexchange.com.7z +https://archive.org/download/stackexchange/writers.stackexchange.com.7z From d603b252581ffe1a12ecc8b25d9b542231908aab Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:41:35 -0700 Subject: [PATCH 18/56] Move upload_gcs_to_hf from datakit/download/ to utilities/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not a download step — it uploads checkpoints from GCS to 
HuggingFace. Belongs in utilities. Updates the one consumer (exp1063_upload_tootsie). Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/tootsie/exp1063_upload_tootsie.py | 2 +- .../marin/{datakit/download => utilities}/upload_gcs_to_hf.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename lib/marin/src/marin/{datakit/download => utilities}/upload_gcs_to_hf.py (100%) diff --git a/experiments/tootsie/exp1063_upload_tootsie.py b/experiments/tootsie/exp1063_upload_tootsie.py index d12aa5e060..c23d5de683 100644 --- a/experiments/tootsie/exp1063_upload_tootsie.py +++ b/experiments/tootsie/exp1063_upload_tootsie.py @@ -25,7 +25,7 @@ from dataclasses import dataclass, field -from marin.datakit.download.upload_gcs_to_hf import UploadConfig, upload_gcs_to_hf +from marin.utilities.upload_gcs_to_hf import UploadConfig, upload_gcs_to_hf from marin.execution.executor import ExecutorStep, executor_main diff --git a/lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py b/lib/marin/src/marin/utilities/upload_gcs_to_hf.py similarity index 100% rename from lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py rename to lib/marin/src/marin/utilities/upload_gcs_to_hf.py From 642b6990e4ff370e65e39f32832c4a7e259d4cd3 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:45:11 -0700 Subject: [PATCH 19/56] Convert ar5iv into a package with its JSON data file alongside MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moves ar5iv.py → ar5iv/__init__.py and places ar5iv-v04-2024.json in the same package directory. Removes the now-empty data/ directory. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/{ar5iv.py => ar5iv/__init__.py} | 0 .../marin/datakit/download/{data => ar5iv}/ar5iv-v04-2024.json | 0 .../src/marin/datakit/download/{data => }/stackexchange/README.md | 0 .../download/{data => }/stackexchange/stackexchange-urls.tsv | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename lib/marin/src/marin/datakit/download/{ar5iv.py => ar5iv/__init__.py} (100%) rename lib/marin/src/marin/datakit/download/{data => ar5iv}/ar5iv-v04-2024.json (100%) rename lib/marin/src/marin/datakit/download/{data => }/stackexchange/README.md (100%) rename lib/marin/src/marin/datakit/download/{data => }/stackexchange/stackexchange-urls.tsv (100%) diff --git a/lib/marin/src/marin/datakit/download/ar5iv.py b/lib/marin/src/marin/datakit/download/ar5iv/__init__.py similarity index 100% rename from lib/marin/src/marin/datakit/download/ar5iv.py rename to lib/marin/src/marin/datakit/download/ar5iv/__init__.py diff --git a/lib/marin/src/marin/datakit/download/data/ar5iv-v04-2024.json b/lib/marin/src/marin/datakit/download/ar5iv/ar5iv-v04-2024.json similarity index 100% rename from lib/marin/src/marin/datakit/download/data/ar5iv-v04-2024.json rename to lib/marin/src/marin/datakit/download/ar5iv/ar5iv-v04-2024.json diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/README.md b/lib/marin/src/marin/datakit/download/stackexchange/README.md similarity index 100% rename from lib/marin/src/marin/datakit/download/data/stackexchange/README.md rename to lib/marin/src/marin/datakit/download/stackexchange/README.md diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv b/lib/marin/src/marin/datakit/download/stackexchange/stackexchange-urls.tsv similarity index 100% rename from lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv rename to 
lib/marin/src/marin/datakit/download/stackexchange/stackexchange-urls.tsv From 92420925f398704fd0425d0bc53375a0af61397f Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:45:31 -0700 Subject: [PATCH 20/56] Delete unused filesystem transfer module transfer_step and transfer_files have zero consumers in the codebase. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/filesystem.py | 81 ------------------- 1 file changed, 81 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download/filesystem.py diff --git a/lib/marin/src/marin/datakit/download/filesystem.py b/lib/marin/src/marin/datakit/download/filesystem.py deleted file mode 100644 index 7ace48ab38..0000000000 --- a/lib/marin/src/marin/datakit/download/filesystem.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -import logging -import os -import random -import time - -from iris.marin_fs import url_to_fs -from marin.execution.step_spec import StepSpec -from zephyr import Dataset, ZephyrContext - -from marin.utils import fsspec_exists, fsspec_glob - -logger = logging.getLogger(__name__) - - -def transfer_files( - input_path: str, - output_path: str, - *, - num_random_files: int | None = None, - filetype: str = "jsonl.zst", -) -> None: - """Transfer files from input_path to output_path. - - When num_random_files is None, copies all matching files. - When specified, randomly samples that many files. - """ - input_path = input_path.rstrip("/") - - logger.info("Transferring %s to %s", input_path, output_path) - start_time = time.time() - fs, _ = url_to_fs(input_path) - if not fs.exists(input_path): - raise FileNotFoundError(f"{input_path} does not exist.") - - filenames = fsspec_glob(os.path.join(input_path, f"**/*.{filetype}")) - - if num_random_files is not None: - random.seed(42) - random.shuffle(filenames) - filenames = filenames[:num_random_files] - - def copy_file(filename: str) -> None: - output_filename = os.path.join(output_path, os.path.basename(filename)) - if not fsspec_exists(output_filename): - fs.makedirs(output_path, exist_ok=True) - fs.copy(filename, output_filename) - - pipeline = Dataset.from_list(filenames).map(copy_file) - ctx = ZephyrContext(name="fs-transfer") - ctx.execute(pipeline) - - elapsed = time.time() - start_time - logger.info("Transferred %s to %s (%.1fs)", input_path, output_path, elapsed) - - -def transfer_step( - name: str, - *, - input_path: str, - num_random_files: int | None = None, - filetype: str = "jsonl.zst", - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that transfers files between fsspec paths.""" - - def _run(output_path: str) -> None: - transfer_files(input_path, output_path, num_random_files=num_random_files, filetype=filetype) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={"input_path": input_path, "num_random_files": num_random_files, "filetype": filetype}, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) From cd65de58d8d0dcf0ee56ecf16b6981a257fd9d95 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:47:21 -0700 Subject: [PATCH 21/56] Move ar5iv logic from __init__.py to download.py within the package Keeps __init__.py as a thin re-export layer, with the actual implementation in ar5iv/download.py alongside the JSON data file. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../marin/datakit/download/ar5iv/__init__.py | 161 +----------------- .../marin/datakit/download/ar5iv/download.py | 160 +++++++++++++++++ 2 files changed, 164 insertions(+), 157 deletions(-) create mode 100644 lib/marin/src/marin/datakit/download/ar5iv/download.py diff --git a/lib/marin/src/marin/datakit/download/ar5iv/__init__.py b/lib/marin/src/marin/datakit/download/ar5iv/__init__.py index 86498e12e1..5d820ef55f 100644 --- a/lib/marin/src/marin/datakit/download/ar5iv/__init__.py +++ b/lib/marin/src/marin/datakit/download/ar5iv/__init__.py @@ -1,160 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -""" -Download and process Ar5iv dataset from a zip file. - -Example Usage: -uv run zephyr --backend=ray --max-parallelism=1000 --memory=10GB \ - lib/marin/src/marin/download/ar5iv/download.py \ - --input_path gs://bucket/ar5iv.zip \ - --output_path gs://bucket/output -""" - -import json -import logging -import zipfile -from collections import defaultdict -from dataclasses import dataclass - -import draccus -from iris.marin_fs import open_url -from marin.execution.step_spec import StepSpec -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename -from iris.logging import configure_logging - -logger = logging.getLogger(__name__) - - -@dataclass -class Ar5ivDownloadConfig: - input_path: str - output_path: str - max_files: int | None = None # Maximum number of shards to process - - -def process_shard(shard_task: dict) -> dict: - """ - Process a single shard by extracting its files from the zip in GCS and uploading the merged JSONL. - - Args: - shard_task: Dict with keys 'input_path', 'output_path', 'shard_id', 'file_list' - """ - input_path = shard_task["input_path"] - output_path = shard_task["output_path"] - shard_id = shard_task["shard_id"] - file_list = shard_task["file_list"] - gcs_path = f"{output_path}/{shard_id}.jsonl.gz" - - with open_url(str(input_path), "rb") as f: - with zipfile.ZipFile(f) as zf: - with atomic_rename(gcs_path) as temp_path, open_url(temp_path, "wt", compression="gzip") as out_f: - for filename in file_list: - with zf.open(filename, "r") as file_handle: - content = file_handle.read() - record = { - "filename": filename, - "format": "html", - "content": content.decode("utf-8", errors="replace"), - } - print(json.dumps(record), file=out_f) - - logger.info(f"Shard {shard_id} with {len(file_list)} files uploaded to {gcs_path}") - return {"shard_id": shard_id, "num_files": len(file_list), "output_path": gcs_path} - - -def download(cfg: Ar5ivDownloadConfig) -> None: - """ - Download and process Ar5iv dataset from a zip file in GCS. - - This function can be called by the executor framework or used standalone. - """ - logger.info("Starting transfer of Ar5iv dataset...") - logger.info(f"Source: {cfg.input_path}") - - # Use fsspec+zipfile to list all files - with open_url(str(cfg.input_path), "rb") as f: - with zipfile.ZipFile(f) as zf: - all_files = zf.infolist() - - # Group by shard directory - # We assume structure: something like: shard_id/.../file - # shard_id is derived from the second last component if files are nested. - # Adjust as needed if directory structure differs. - shard_dict = defaultdict(list) - for info in all_files: - if info.is_dir(): - continue - # E.g. 
path might look like: "003/something.html" - # Extract shard_id from the directory: - # Split by "/" and take the first part if we assume structure {shard_id}/file - parts = info.filename.strip("/").split("/") - if len(parts) < 2: - # File at root level - decide how to handle this case. - # If no directory structure is given, skip or treat differently. - continue - shard_id = parts[-2] # get the second-last directory as shard_id - shard_dict[shard_id].append(info.filename) - - # Apply max_files limit if provided - shard_ids = list(shard_dict.keys()) - if cfg.max_files is not None: - shard_ids = shard_ids[: cfg.max_files] - - logger.info(f"Found {len(shard_ids)} shards to process.") - - # Build task list for each shard - shard_tasks = [] - for shard_id in shard_ids: - shard_tasks.append( - { - "input_path": cfg.input_path, - "output_path": cfg.output_path, - "shard_id": shard_id, - "file_list": shard_dict[shard_id], - } - ) - - # Execute pipeline with zephyr - pipeline = ( - Dataset.from_list(shard_tasks) - .map(process_shard) - .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) - ) - ctx = ZephyrContext(name="download-ar5iv") - ctx.execute(pipeline) - - logger.info("Transfer completed successfully!") - - -def ar5iv_step( - name: str = "raw/ar5iv", - *, - input_path: str, - max_files: int | None = None, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that downloads and processes the Ar5iv dataset from a zip file.""" - - def _run(output_path: str) -> None: - download(Ar5ivDownloadConfig(input_path=input_path, output_path=output_path, max_files=max_files)) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={"input_path": input_path, "max_files": max_files}, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) - - -@draccus.wrap() -def main(cfg: Ar5ivDownloadConfig) -> None: - """CLI entrypoint for downloading and processing Ar5iv dataset.""" - - configure_logging(level=logging.INFO) - download(cfg) +from marin.datakit.download.ar5iv.download import Ar5ivDownloadConfig as Ar5ivDownloadConfig +from marin.datakit.download.ar5iv.download import ar5iv_step as ar5iv_step +from marin.datakit.download.ar5iv.download import download as download +from marin.datakit.download.ar5iv.download import process_shard as process_shard diff --git a/lib/marin/src/marin/datakit/download/ar5iv/download.py b/lib/marin/src/marin/datakit/download/ar5iv/download.py new file mode 100644 index 0000000000..86498e12e1 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/ar5iv/download.py @@ -0,0 +1,160 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Download and process Ar5iv dataset from a zip file. 
+ +Example Usage: +uv run zephyr --backend=ray --max-parallelism=1000 --memory=10GB \ + lib/marin/src/marin/download/ar5iv/download.py \ + --input_path gs://bucket/ar5iv.zip \ + --output_path gs://bucket/output +""" + +import json +import logging +import zipfile +from collections import defaultdict +from dataclasses import dataclass + +import draccus +from iris.marin_fs import open_url +from marin.execution.step_spec import StepSpec +from zephyr import Dataset, ZephyrContext +from zephyr.writers import atomic_rename +from iris.logging import configure_logging + +logger = logging.getLogger(__name__) + + +@dataclass +class Ar5ivDownloadConfig: + input_path: str + output_path: str + max_files: int | None = None # Maximum number of shards to process + + +def process_shard(shard_task: dict) -> dict: + """ + Process a single shard by extracting its files from the zip in GCS and uploading the merged JSONL. + + Args: + shard_task: Dict with keys 'input_path', 'output_path', 'shard_id', 'file_list' + """ + input_path = shard_task["input_path"] + output_path = shard_task["output_path"] + shard_id = shard_task["shard_id"] + file_list = shard_task["file_list"] + gcs_path = f"{output_path}/{shard_id}.jsonl.gz" + + with open_url(str(input_path), "rb") as f: + with zipfile.ZipFile(f) as zf: + with atomic_rename(gcs_path) as temp_path, open_url(temp_path, "wt", compression="gzip") as out_f: + for filename in file_list: + with zf.open(filename, "r") as file_handle: + content = file_handle.read() + record = { + "filename": filename, + "format": "html", + "content": content.decode("utf-8", errors="replace"), + } + print(json.dumps(record), file=out_f) + + logger.info(f"Shard {shard_id} with {len(file_list)} files uploaded to {gcs_path}") + return {"shard_id": shard_id, "num_files": len(file_list), "output_path": gcs_path} + + +def download(cfg: Ar5ivDownloadConfig) -> None: + """ + Download and process Ar5iv dataset from a zip file in GCS. + + This function can be called by the executor framework or used standalone. + """ + logger.info("Starting transfer of Ar5iv dataset...") + logger.info(f"Source: {cfg.input_path}") + + # Use fsspec+zipfile to list all files + with open_url(str(cfg.input_path), "rb") as f: + with zipfile.ZipFile(f) as zf: + all_files = zf.infolist() + + # Group by shard directory + # We assume structure: something like: shard_id/.../file + # shard_id is derived from the second last component if files are nested. + # Adjust as needed if directory structure differs. + shard_dict = defaultdict(list) + for info in all_files: + if info.is_dir(): + continue + # E.g. path might look like: "003/something.html" + # Extract shard_id from the directory: + # Split by "/" and take the first part if we assume structure {shard_id}/file + parts = info.filename.strip("/").split("/") + if len(parts) < 2: + # File at root level - decide how to handle this case. + # If no directory structure is given, skip or treat differently. 
+ continue + shard_id = parts[-2] # get the second-last directory as shard_id + shard_dict[shard_id].append(info.filename) + + # Apply max_files limit if provided + shard_ids = list(shard_dict.keys()) + if cfg.max_files is not None: + shard_ids = shard_ids[: cfg.max_files] + + logger.info(f"Found {len(shard_ids)} shards to process.") + + # Build task list for each shard + shard_tasks = [] + for shard_id in shard_ids: + shard_tasks.append( + { + "input_path": cfg.input_path, + "output_path": cfg.output_path, + "shard_id": shard_id, + "file_list": shard_dict[shard_id], + } + ) + + # Execute pipeline with zephyr + pipeline = ( + Dataset.from_list(shard_tasks) + .map(process_shard) + .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) + ) + ctx = ZephyrContext(name="download-ar5iv") + ctx.execute(pipeline) + + logger.info("Transfer completed successfully!") + + +def ar5iv_step( + name: str = "raw/ar5iv", + *, + input_path: str, + max_files: int | None = None, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads and processes the Ar5iv dataset from a zip file.""" + + def _run(output_path: str) -> None: + download(Ar5ivDownloadConfig(input_path=input_path, output_path=output_path, max_files=max_files)) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={"input_path": input_path, "max_files": max_files}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) + + +@draccus.wrap() +def main(cfg: Ar5ivDownloadConfig) -> None: + """CLI entrypoint for downloading and processing Ar5iv dataset.""" + + configure_logging(level=logging.INFO) + download(cfg) From 7601f45cf1d6340e2abaa859db2dd018e34e929c Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:49:59 -0700 Subject: [PATCH 22/56] Delete unused stream_remove_columns module and its test Zero production consumers. The test_prune_hf_dataset test only exercised the deleted module. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../datakit/download/stream_remove_columns.py | 100 ------------------ tests/download/test_huggingface.py | 49 --------- 2 files changed, 149 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download/stream_remove_columns.py diff --git a/lib/marin/src/marin/datakit/download/stream_remove_columns.py b/lib/marin/src/marin/datakit/download/stream_remove_columns.py deleted file mode 100644 index ba883ee944..0000000000 --- a/lib/marin/src/marin/datakit/download/stream_remove_columns.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Remove unnecessary columns while streaming data from huggingface.""" - -import logging -import os -from dataclasses import dataclass - -import pandas as pd -import pyarrow.parquet as pq -from huggingface_hub import HfFileSystem -from tqdm import tqdm -from zephyr import Dataset, ZephyrContext - -logger = logging.getLogger(__name__) - - -def prune_stream_and_save(input_file: str, output_file: str, keep_columns: list[str]): - """ - Prunes and saves a parquet file by removing un-specified columns. - - Reads the input parquet file in batches, removes columns not in keep_columns, - and writes the result to output_file. Processing in batches avoids memory issues. 
- - Args: - input_file (str): Path to input parquet file on HuggingFace - output_file (str): Path where pruned parquet file will be saved - keep_columns (list[str]): List of column names to retain - """ - parquet_file = pq.ParquetFile(HfFileSystem().open(input_file)) - - full_df_list = [] - for batch in tqdm(parquet_file.iter_batches(batch_size=10000), desc=f"Processing {input_file}"): - df = batch.to_pandas() - - drop_columns = [col for col in df.columns if col not in keep_columns] - df = df.drop(columns=drop_columns) - - full_df_list.append(df) - - full_df = pd.concat(full_df_list) - logger.info(f"Saving pruned dataset of shape {full_df.shape} to {output_file}") - full_df.to_parquet(output_file, index=False) - - -def get_file_tasks(hf_path: str, output_path: str, keep_columns: list[str]): - """ - Generate file processing tasks for a HuggingFace subset. - - Args: - hf_path (str): The HuggingFace dataset path to load - output_path (str): The output path to save the pruned dataset - keep_columns (list[str]): The columns to keep in the pruned dataset - - Yields: - Dict with input_file, output_file, and keep_columns for each parquet file - """ - logger.info(f"Loading dataset from {hf_path}") - parquet_list = HfFileSystem().glob(f"{hf_path}/*.parquet") - - for file in parquet_list: - output_file = os.path.join(output_path, os.path.basename(file)) - yield {"input_file": file, "output_file": output_file, "keep_columns": keep_columns} - - -@dataclass -class DatasetConfig: - hf_repo_id: str - hf_revision: str - hf_paths: list[str] - output_path: str - keep_columns: list[str] - - -def prune_hf_dataset(cfg: DatasetConfig): - logger.info(f"Starting dataset pruning for {cfg.hf_paths}") - - # Build list of subset paths to process - subset_tasks = [] - for path in cfg.hf_paths: - # HF Path form: hf://[][@]/ - hf_path = f"hf://datasets/{cfg.hf_repo_id}@{cfg.hf_revision}/{path}" - logger.info(f"Processing subset {hf_path}") - output_path = os.path.join(cfg.output_path, path) - subset_tasks.append({"hf_path": hf_path, "output_path": output_path}) - - # Build pipeline with nested parallelism: - # - Outer level: process subsets (MAX_CONCURRENT_WORKERS=1) - # - Inner level: process files within each subset - pipeline = ( - Dataset.from_list(subset_tasks) - .flat_map(lambda task: get_file_tasks(task["hf_path"], task["output_path"], cfg.keep_columns)) - .map(lambda task: prune_stream_and_save(task["input_file"], task["output_file"], cfg.keep_columns)) - ) - - logger.info("Executing pipeline") - ctx = ZephyrContext(name="hf-remove-columns") - ctx.execute(pipeline) - logger.info("Successfully processed all subsets") diff --git a/tests/download/test_huggingface.py b/tests/download/test_huggingface.py index 4d16eadf6b..f055cc94ca 100644 --- a/tests/download/test_huggingface.py +++ b/tests/download/test_huggingface.py @@ -7,7 +7,6 @@ import json from unittest.mock import MagicMock, Mock, patch -import pandas as pd import pytest from marin.datakit.download.huggingface import ( @@ -16,10 +15,6 @@ download_hf, stream_file_to_fsspec, ) -from marin.datakit.download.stream_remove_columns import ( - DatasetConfig, - prune_hf_dataset, -) @pytest.fixture @@ -155,50 +150,6 @@ def test_download_hf_bucket_requires_newer_huggingface_hub(tmp_path): download_hf(cfg) -def test_prune_hf_dataset(tmp_path): - """Test full dataset pruning pipeline.""" - # Create test parquet data - test_data = pd.DataFrame( - { - "id": [1, 2], - "text": ["hello", "world"], - "unwanted": ["a", "b"], - } - ) - - # Create multiple buffers since each call 
needs a fresh one - def create_buffer(): - buffer = io.BytesIO() - test_data.to_parquet(buffer, index=False) - buffer.seek(0) - return buffer - - cfg = DatasetConfig( - hf_repo_id="test-org/test-dataset", - hf_revision="main", - hf_paths=["data"], - output_path=str(tmp_path / "output"), - keep_columns=["id", "text"], - ) - - # Create output directory structure - output_dir = tmp_path / "output" / "data" - output_dir.mkdir(parents=True) - - mock_fs = MagicMock() - mock_fs.glob = Mock(return_value=["hf://datasets/test-org/test-dataset@main/data/file.parquet"]) - mock_fs.open = Mock(side_effect=lambda path, mode="rb": create_buffer()) - - with patch("marin.datakit.download.stream_remove_columns.HfFileSystem", return_value=mock_fs): - prune_hf_dataset(cfg) - - # Verify output - output_file = tmp_path / "output" / "data" / "file.parquet" - assert output_file.exists() - result_df = pd.read_parquet(output_file) - assert list(result_df.columns) == ["id", "text"] - - def test_stream_file_to_fsspec_retries_on_timeout(tmp_path): """A socket timeout while reading should trigger retry and then succeed.""" file_path = "datasets/test-org/test-dataset/data/file1.txt" From 64f5c48c365dcd37954b7b592e92a3d314f5c821 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:51:23 -0700 Subject: [PATCH 23/56] Remove unused dclm_hq_step function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No consumers — extract_dclm_hq_dump is called directly by transform_dclm_hq, not as a standalone download step. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/dclm_hq.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/dclm_hq.py b/lib/marin/src/marin/datakit/download/dclm_hq.py index a4301245aa..b473768ec0 100644 --- a/lib/marin/src/marin/datakit/download/dclm_hq.py +++ b/lib/marin/src/marin/datakit/download/dclm_hq.py @@ -23,7 +23,6 @@ import requests from iris.marin_fs import open_url -from marin.execution.step_spec import StepSpec import warcio from marin.utils import fsspec_glob from tqdm import tqdm @@ -193,26 +192,3 @@ def extract_dclm_hq_dump(input_path: str, output_path: str) -> None: ctx.execute(pipeline) logger.info("Processing completed successfully!") - - -def dclm_hq_step( - name: str = "raw/dclm-hq-html", - *, - input_path: str, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that downloads DCLM HQ HTML data from Common Crawl.""" - - def _run(output_path: str) -> None: - extract_dclm_hq_dump(input_path, output_path) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={"input_path": input_path}, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) From 76e3336775dfa26d40749f0596575bfca79906e7 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:08:32 -0700 Subject: [PATCH 24/56] Simplify nemotron_cc_step to download_nemotron_cc_step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove unnecessary parameters (deps, output_path_prefix, override_output_path) from the step function — this download takes no configuration. Rename for consistency. 
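Illustrative call-site sketch (the real consumer is
experiments/pretraining_datasets/nemotron.py; names as of this patch):

    from marin.datakit.download.nemotron_cc import download_nemotron_cc_step

    # The step only needs a name now; the step runner passes output_path to the
    # wrapped download function.
    step = download_nemotron_cc_step("raw/nemotron-cc")
    executor_step = step.as_executor_step()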
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/nemotron.py | 4 +-- .../src/marin/datakit/download/nemotron_cc.py | 26 +++---------------- 2 files changed, 5 insertions(+), 25 deletions(-) diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index 4c463d8e4f..7a5b50afb0 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -8,14 +8,14 @@ from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE from experiments.pretraining_datasets.dclm import dclm_components_llama3 -from marin.datakit.download.nemotron_cc import nemotron_cc_step +from marin.datakit.download.nemotron_cc import download_nemotron_cc_step from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep # Raw dataset download step downloads = { - "nemotron_cc": nemotron_cc_step("raw/nemotro-cc").as_executor_step(), + "nemotron_cc": download_nemotron_cc_step("raw/nemotro-cc").as_executor_step(), } _nemotron_cc_path = output_path_of(downloads["nemotron_cc"], "contrib/Nemotron/Nemotron-CC/data-jsonl/") diff --git a/lib/marin/src/marin/datakit/download/nemotron_cc.py b/lib/marin/src/marin/datakit/download/nemotron_cc.py index 0e65f307b9..8ba11e95b1 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_cc.py +++ b/lib/marin/src/marin/datakit/download/nemotron_cc.py @@ -1,14 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -""" -Download and process Nemotron-CC dataset from Common Crawl. - -Example Usage: -uv run zephyr --backend=ray --max-parallelism=100 --memory=4GB \ - lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py \ - --output_path gs://bucket/nemotron-output -""" +"""Download and process Nemotron-CC dataset from Common Crawl""" import json import logging @@ -115,23 +108,10 @@ def download_nemotron_cc(output_path: str) -> None: logger.info(f"Downloaded Nemotron CC files to {output_path}") -def nemotron_cc_step( - name: str = "raw/nemotron-cc", - *, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: +def download_nemotron_cc_step(name: str = "raw/nemotron-cc") -> StepSpec: """Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl.""" - def _run(output_path: str) -> None: - download_nemotron_cc(output_path) - return StepSpec( name=name, - fn=_run, - deps=deps or [], - hash_attrs={}, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, + fn=lambda output_path: download_nemotron_cc(output_path=output_path), ) From 90b9b65ef5555c3bf133c8aa613870e458aafc73 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:12:34 -0700 Subject: [PATCH 25/56] Rename nemotron_cc.py to nemotron_v1.py Prepares for adding nemotron_v2 download module alongside. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/nemotron.py | 2 +- .../datakit/download/{nemotron_cc.py => nemotron_v1.py} | 0 tests/download/test_nemotron_cc.py | 6 +++--- 3 files changed, 4 insertions(+), 4 deletions(-) rename lib/marin/src/marin/datakit/download/{nemotron_cc.py => nemotron_v1.py} (100%) diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index 7a5b50afb0..35d3d86e2b 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -8,7 +8,7 @@ from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE from experiments.pretraining_datasets.dclm import dclm_components_llama3 -from marin.datakit.download.nemotron_cc import download_nemotron_cc_step +from marin.datakit.download.nemotron_v1 import download_nemotron_cc_step from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/lib/marin/src/marin/datakit/download/nemotron_cc.py b/lib/marin/src/marin/datakit/download/nemotron_v1.py similarity index 100% rename from lib/marin/src/marin/datakit/download/nemotron_cc.py rename to lib/marin/src/marin/datakit/download/nemotron_v1.py diff --git a/tests/download/test_nemotron_cc.py b/tests/download/test_nemotron_cc.py index e4e89e361a..e8ed0e2de1 100644 --- a/tests/download/test_nemotron_cc.py +++ b/tests/download/test_nemotron_cc.py @@ -9,10 +9,10 @@ import pytest import zstandard as zstd from iris.marin_fs import open_url as _real_open_url -from marin.datakit.download.nemotron_cc import download_nemotron_cc +from marin.datakit.download.nemotron_v1 import download_nemotron_cc -_OPEN_URL_TARGET = "marin.datakit.download.nemotron_cc.open_url" -_REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_cc.requests.Session" +_OPEN_URL_TARGET = "marin.datakit.download.nemotron_v1.open_url" +_REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_v1.requests.Session" SAMPLE_NEMOTRON_RECORDS = [ { From 1760f2f089eb9221551113a848b7777f9242cd03 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:14:39 -0700 Subject: [PATCH 26/56] Extract nemotron_v2 download definitions into datakit/download/nemotron_v2.py Moves NEMOTRON_V2_DATASETS and nemotron_v2_download_step() from experiments/pretraining_datasets/nemotron_v2.py into a datakit module. Replaces the raw dict with a NemotronV2Dataset dataclass. The experiment file now imports definitions and only wires tokenization. 
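For example, the experiment wiring now reduces to (sketch mirroring the updated
experiment file in this patch):

    from marin.datakit.download.nemotron_v2 import NEMOTRON_V2_DATASETS, nemotron_v2_download_step

    # One download step per dataset family; tokenization reads subsets from the
    # dataclass via info.subsets.
    downloads = {
        family: nemotron_v2_download_step(family).as_executor_step()
        for family in NEMOTRON_V2_DATASETS
    }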
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/__init__.py | 2 +- .../pretraining_datasets/nemotron_v2.py | 124 ++---------------- .../src/marin/datakit/download/nemotron_v2.py | 122 +++++++++++++++++ 3 files changed, 131 insertions(+), 117 deletions(-) create mode 100644 lib/marin/src/marin/datakit/download/nemotron_v2.py diff --git a/experiments/pretraining_datasets/__init__.py b/experiments/pretraining_datasets/__init__.py index 090a298498..6ca2bff80f 100644 --- a/experiments/pretraining_datasets/__init__.py +++ b/experiments/pretraining_datasets/__init__.py @@ -130,7 +130,7 @@ # Nemotron v2 datasets (from nvidia/Nemotron-Pre-Training-Datasets collection) **{ family: { - "subsets": list(info["subsets"].keys()), + "subsets": list(info.subsets.keys()), "download": nemotron_v2_downloads[family], "tokenize_fn": lambda f=family: tokenize_nemotron_v2_family(f), } diff --git a/experiments/pretraining_datasets/nemotron_v2.py b/experiments/pretraining_datasets/nemotron_v2.py index ccb79f9e14..b3cd1d6760 100644 --- a/experiments/pretraining_datasets/nemotron_v2.py +++ b/experiments/pretraining_datasets/nemotron_v2.py @@ -2,134 +2,26 @@ # SPDX-License-Identifier: Apache-2.0 """ -Nemotron v2 pre-training dataset definitions and tokenization. +Nemotron v2 pre-training dataset tokenization. -These datasets come from the nvidia/Nemotron-Pre-Training-Datasets collection -on HuggingFace. They are additive to the original Nemotron-CC (v1) dataset -defined in nemotron.py. - -Most of these datasets are gated and require HF_TOKEN at download time. -All use parquet format with a "text" field. +Download definitions live in marin.datakit.download.nemotron_v2. +This file wires them into tokenization steps for experiment pipelines. """ import os.path -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.nemotron_v2 import NEMOTRON_V2_DATASETS, nemotron_v2_download_step from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep -# ============================================================================ -# DATASET DEFINITIONS -# ============================================================================ - -# Each entry: (hf_id, revision, subsets_dict) -# subsets_dict maps subset_name -> glob pattern for parquet files within the download - -NEMOTRON_V2_DATASETS = { - "nemotron_cc_v2": { - "hf_dataset_id": "nvidia/Nemotron-CC-v2", - "revision": "229a2e7", - "subsets": { - "diverse_qa": "Diverse-QA/**/*.parquet", - "high_quality": "High-Quality/**/*.parquet", - "high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet", - "medium_high_quality": "Medium-High-Quality/**/*.parquet", - "medium_quality": "Medium-Quality/**/*.parquet", - "translated_diverse_qa": "Translated-Diverse-QA/**/*.parquet", - }, - }, - "nemotron_cc_v2_1": { - "hf_dataset_id": "nvidia/Nemotron-CC-v2.1", - "revision": "ba6f2aa", - "subsets": { - "high_quality": "High-Quality/**/*.parquet", - "high_quality_dqa": "High-Quality-DQA/**/*.parquet", - "high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet", - "high_quality_translated": "High-Quality-Translated-To-English/**/*.parquet", - "high_quality_translated_synthetic": "High-Quality-Translated-To-English-Synthetic/**/*.parquet", - "medium_high_quality": "Medium-High-Quality/**/*.parquet", - "medium_high_quality_synthetic": 
"Medium-High-Quality-Synthetic/**/*.parquet", - "medium_high_quality_translated": "Medium-High-Quality-Translated-To-English/**/*.parquet", - "medium_quality": "Medium-Quality/**/*.parquet", - }, - }, - "nemotron_cc_code_v1": { - "hf_dataset_id": "nvidia/Nemotron-CC-Code-v1", - "revision": "5c5bebc", - "subsets": { - "all": "data/**/*.parquet", - }, - }, - "nemotron_cc_math_v1": { - "hf_dataset_id": "nvidia/Nemotron-CC-Math-v1", - "revision": "397a250", - "subsets": { - "3": "3/**/*.parquet", - "4plus": "4plus/**/*.parquet", - "4plus_mind": "4plus_MIND/**/*.parquet", - }, - }, - "nemotron_pretraining_code_v1": { - "hf_dataset_id": "nvidia/Nemotron-Pretraining-Code-v1", - "revision": "01393d3", - "subsets": { - "synthetic_code": "Synthetic-Code/**/*.parquet", - "code_metadata": "Nemotron-Code-Metadata/**/*.parquet", - }, - }, - "nemotron_pretraining_code_v2": { - "hf_dataset_id": "nvidia/Nemotron-Pretraining-Code-v2", - "revision": "7b1a453", - "subsets": { - "code_metadata": "Nemotron-Code-Metadata/**/*.parquet", - "synthetic_question_answering": "Synthetic-Question-Answering/**/*.parquet", - "synthetic_student_teacher": "Synthetic-Student-Teacher/**/*.parquet", - "synthetic_code_review": "Synthetic-Code-Review/**/*.parquet", - "synthetic_rewriting": "Synthetic-Rewriting/**/*.parquet", - "synthetic_transpilation": "Synthetic-Transpilation/**/*.parquet", - }, - }, - "nemotron_pretraining_specialized_v1": { - "hf_dataset_id": "nvidia/Nemotron-Pretraining-Specialized-v1", - "revision": "9ed3718", - "subsets": { - "wiki_rewrite": "Nemotron-Pretraining-Wiki-Rewrite/**/*.parquet", - "math_textbooks": "Nemotron-Pretraining-Math-Textbooks/**/*.parquet", - "stem_sft": "Nemotron-Pretraining-STEM-SFT/**/*.parquet", - "scientific_coding": "Nemotron-Pretraining-Scientific-Coding/**/*.parquet", - "rqa": "Nemotron-Pretraining-RQA/**/*.parquet", - "infinibyte_reasoning": "Nemotron-Pretraining-InfiniByte-Reasoning/**/*.parquet", - }, - }, - "nemotron_pretraining_sft_v1": { - "hf_dataset_id": "nvidia/Nemotron-Pretraining-SFT-v1", - "revision": "3f1a5b8", - "subsets": { - "sft_code": "Nemotron-SFT-Code/**/*.parquet", - "sft_general": "Nemotron-SFT-General/**/*.parquet", - "sft_math": "Nemotron-SFT-MATH/**/*.parquet", - }, - }, -} - - # ============================================================================ # RAW DATASET DOWNLOADS # ============================================================================ -downloads: dict[str, ExecutorStep] = {} -for _family, _info in NEMOTRON_V2_DATASETS.items(): - downloads[_family] = ExecutorStep( - name=f"raw/{_family}", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id=_info["hf_dataset_id"], - revision=versioned(_info["revision"]), - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - ) +downloads: dict[str, ExecutorStep] = { + family: nemotron_v2_download_step(family).as_executor_step() for family in NEMOTRON_V2_DATASETS +} # ============================================================================ @@ -152,7 +44,7 @@ def tokenize_nemotron_v2_family( download_step = downloads[family] steps: dict[str, ExecutorStep[TokenizeConfig]] = {} - for subset, glob_pattern in info["subsets"].items(): + for subset, glob_pattern in info.subsets.items(): output_name = os.path.join("tokenized", family, subset) step = ExecutorStep( name=output_name, diff --git a/lib/marin/src/marin/datakit/download/nemotron_v2.py b/lib/marin/src/marin/datakit/download/nemotron_v2.py new file mode 100644 index 0000000000..60b4f7902b --- /dev/null +++ 
b/lib/marin/src/marin/datakit/download/nemotron_v2.py @@ -0,0 +1,122 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Nemotron v2 pre-training dataset download definitions. + +These datasets come from the nvidia/Nemotron-Pre-Training-Datasets collection +on HuggingFace. They are additive to the original Nemotron-CC (v1) dataset. + +Most of these datasets are gated and require HF_TOKEN at download time. +All use parquet format with a "text" field. +""" + +from dataclasses import dataclass, field + +from marin.datakit.download.huggingface import download_hf_step +from marin.execution.step_spec import StepSpec + + +@dataclass(frozen=True) +class NemotronV2Dataset: + """Metadata for a single Nemotron v2 HuggingFace dataset.""" + + hf_dataset_id: str + revision: str + subsets: dict[str, str] = field(default_factory=dict) + """Maps subset_name -> glob pattern for parquet files within the download.""" + + +NEMOTRON_V2_DATASETS: dict[str, NemotronV2Dataset] = { + "nemotron_cc_v2": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-CC-v2", + revision="229a2e7", + subsets={ + "diverse_qa": "Diverse-QA/**/*.parquet", + "high_quality": "High-Quality/**/*.parquet", + "high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet", + "medium_high_quality": "Medium-High-Quality/**/*.parquet", + "medium_quality": "Medium-Quality/**/*.parquet", + "translated_diverse_qa": "Translated-Diverse-QA/**/*.parquet", + }, + ), + "nemotron_cc_v2_1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-CC-v2.1", + revision="ba6f2aa", + subsets={ + "high_quality": "High-Quality/**/*.parquet", + "high_quality_dqa": "High-Quality-DQA/**/*.parquet", + "high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet", + "high_quality_translated": "High-Quality-Translated-To-English/**/*.parquet", + "high_quality_translated_synthetic": "High-Quality-Translated-To-English-Synthetic/**/*.parquet", + "medium_high_quality": "Medium-High-Quality/**/*.parquet", + "medium_high_quality_synthetic": "Medium-High-Quality-Synthetic/**/*.parquet", + "medium_high_quality_translated": "Medium-High-Quality-Translated-To-English/**/*.parquet", + "medium_quality": "Medium-Quality/**/*.parquet", + }, + ), + "nemotron_cc_code_v1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-CC-Code-v1", + revision="5c5bebc", + subsets={"all": "data/**/*.parquet"}, + ), + "nemotron_cc_math_v1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-CC-Math-v1", + revision="397a250", + subsets={ + "3": "3/**/*.parquet", + "4plus": "4plus/**/*.parquet", + "4plus_mind": "4plus_MIND/**/*.parquet", + }, + ), + "nemotron_pretraining_code_v1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v1", + revision="01393d3", + subsets={ + "synthetic_code": "Synthetic-Code/**/*.parquet", + "code_metadata": "Nemotron-Code-Metadata/**/*.parquet", + }, + ), + "nemotron_pretraining_code_v2": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v2", + revision="7b1a453", + subsets={ + "code_metadata": "Nemotron-Code-Metadata/**/*.parquet", + "synthetic_question_answering": "Synthetic-Question-Answering/**/*.parquet", + "synthetic_student_teacher": "Synthetic-Student-Teacher/**/*.parquet", + "synthetic_code_review": "Synthetic-Code-Review/**/*.parquet", + "synthetic_rewriting": "Synthetic-Rewriting/**/*.parquet", + "synthetic_transpilation": "Synthetic-Transpilation/**/*.parquet", + }, + ), + "nemotron_pretraining_specialized_v1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-Pretraining-Specialized-v1", + 
revision="9ed3718", + subsets={ + "wiki_rewrite": "Nemotron-Pretraining-Wiki-Rewrite/**/*.parquet", + "math_textbooks": "Nemotron-Pretraining-Math-Textbooks/**/*.parquet", + "stem_sft": "Nemotron-Pretraining-STEM-SFT/**/*.parquet", + "scientific_coding": "Nemotron-Pretraining-Scientific-Coding/**/*.parquet", + "rqa": "Nemotron-Pretraining-RQA/**/*.parquet", + "infinibyte_reasoning": "Nemotron-Pretraining-InfiniByte-Reasoning/**/*.parquet", + }, + ), + "nemotron_pretraining_sft_v1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-Pretraining-SFT-v1", + revision="3f1a5b8", + subsets={ + "sft_code": "Nemotron-SFT-Code/**/*.parquet", + "sft_general": "Nemotron-SFT-General/**/*.parquet", + "sft_math": "Nemotron-SFT-MATH/**/*.parquet", + }, + ), +} + + +def nemotron_v2_download_step(family: str) -> StepSpec: + """Create a download StepSpec for a Nemotron v2 dataset family.""" + info = NEMOTRON_V2_DATASETS[family] + return download_hf_step( + f"raw/{family}", + hf_dataset_id=info.hf_dataset_id, + revision=info.revision, + ) From d03a03a1096cf3872ba6b3a5ae59af6a03b27a04 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:18:22 -0700 Subject: [PATCH 27/56] Rename nemotron step functions for consistency download_nemotron_cc_step -> download_nemotron_v1_step nemotron_v2_download_step -> download_nemotron_v2_step Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/nemotron.py | 4 ++-- experiments/pretraining_datasets/nemotron_v2.py | 4 ++-- lib/marin/src/marin/datakit/download/nemotron_v1.py | 2 +- lib/marin/src/marin/datakit/download/nemotron_v2.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index 35d3d86e2b..22fc4d1efa 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -8,14 +8,14 @@ from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE from experiments.pretraining_datasets.dclm import dclm_components_llama3 -from marin.datakit.download.nemotron_v1 import download_nemotron_cc_step +from marin.datakit.download.nemotron_v1 import download_nemotron_v1_step from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep # Raw dataset download step downloads = { - "nemotron_cc": download_nemotron_cc_step("raw/nemotro-cc").as_executor_step(), + "nemotron_cc": download_nemotron_v1_step("raw/nemotro-cc").as_executor_step(), } _nemotron_cc_path = output_path_of(downloads["nemotron_cc"], "contrib/Nemotron/Nemotron-CC/data-jsonl/") diff --git a/experiments/pretraining_datasets/nemotron_v2.py b/experiments/pretraining_datasets/nemotron_v2.py index b3cd1d6760..980b5edc90 100644 --- a/experiments/pretraining_datasets/nemotron_v2.py +++ b/experiments/pretraining_datasets/nemotron_v2.py @@ -10,7 +10,7 @@ import os.path -from marin.datakit.download.nemotron_v2 import NEMOTRON_V2_DATASETS, nemotron_v2_download_step +from marin.datakit.download.nemotron_v2 import NEMOTRON_V2_DATASETS, download_nemotron_v2_step from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep @@ -20,7 +20,7 @@ # 
============================================================================ downloads: dict[str, ExecutorStep] = { - family: nemotron_v2_download_step(family).as_executor_step() for family in NEMOTRON_V2_DATASETS + family: download_nemotron_v2_step(family).as_executor_step() for family in NEMOTRON_V2_DATASETS } diff --git a/lib/marin/src/marin/datakit/download/nemotron_v1.py b/lib/marin/src/marin/datakit/download/nemotron_v1.py index 8ba11e95b1..0befbf1883 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v1.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v1.py @@ -108,7 +108,7 @@ def download_nemotron_cc(output_path: str) -> None: logger.info(f"Downloaded Nemotron CC files to {output_path}") -def download_nemotron_cc_step(name: str = "raw/nemotron-cc") -> StepSpec: +def download_nemotron_v1_step(name: str = "raw/nemotron-cc") -> StepSpec: """Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl.""" return StepSpec( diff --git a/lib/marin/src/marin/datakit/download/nemotron_v2.py b/lib/marin/src/marin/datakit/download/nemotron_v2.py index 60b4f7902b..4c31f81ffa 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v2.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v2.py @@ -112,7 +112,7 @@ class NemotronV2Dataset: } -def nemotron_v2_download_step(family: str) -> StepSpec: +def download_nemotron_v2_step(family: str) -> StepSpec: """Create a download StepSpec for a Nemotron v2 dataset family.""" info = NEMOTRON_V2_DATASETS[family] return download_hf_step( From 39fe0d16624882dc986cc7d01325047212126dca Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:19:30 -0700 Subject: [PATCH 28/56] Remove unnecessary __all__ from uncheatable_eval module Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/uncheatable_eval.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/uncheatable_eval.py b/lib/marin/src/marin/datakit/download/uncheatable_eval.py index 0bcdef3439..f009ba158c 100644 --- a/lib/marin/src/marin/datakit/download/uncheatable_eval.py +++ b/lib/marin/src/marin/datakit/download/uncheatable_eval.py @@ -427,12 +427,3 @@ def make_uncheatable_eval_step( github_token=github_token, skip_existing=skip_existing, ).as_executor_step() - - -__all__ = [ - "UncheatableEvalDataset", - "UncheatableEvalDownloadConfig", - "download_latest_uncheatable_eval", - "make_uncheatable_eval_step", - "uncheatable_eval_step", -] From c53407f5ff0ba04eaf2104bddb82d3299b6bdce4 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:20:46 -0700 Subject: [PATCH 29/56] Remove unused wikipedia_step function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No consumers — download() is called directly via ExecutorStep/CLI. 
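A hypothetical sketch of the direct ExecutorStep wiring (illustrative only; the
config fields match WikipediaDownloadConfig as of this patch, and the URL is the
simplewiki example from the module docstring):

    from marin.datakit.download.wikipedia import WikipediaDownloadConfig, download
    from marin.execution.executor import ExecutorStep, this_output_path

    simplewiki = (
        "https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/"
        "simplewiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz"
    )
    step = ExecutorStep(
        name="raw/wikipedia",
        fn=download,
        config=WikipediaDownloadConfig(
            input_urls=[simplewiki],
            revision="20250320",
            output_path=this_output_path(),
        ),
    )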
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/wikipedia.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index 1dce125a0f..cfd919fae6 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -35,7 +35,6 @@ import draccus import requests from iris.marin_fs import open_url -from marin.execution.step_spec import StepSpec from marin.utils import fsspec_size from tqdm_loggable.auto import tqdm from zephyr import Dataset, ZephyrContext, atomic_rename, load_jsonl @@ -124,27 +123,3 @@ def download(cfg: WikipediaDownloadConfig) -> None: ) logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) - - -def wikipedia_step( - name: str = "raw/wikipedia", - *, - input_urls: list[str], - revision: str, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that downloads and processes Wikipedia HTML dumps.""" - - def _run(output_path: str) -> None: - download(WikipediaDownloadConfig(input_urls=input_urls, revision=revision, output_path=output_path)) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={"input_urls": input_urls, "revision": revision}, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) From a85e541f3fa839876e3d46cd71d927ec060b87be Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:24:07 -0700 Subject: [PATCH 30/56] Flatten wikipedia download to plain parameters, remove draccus CLI download_wikipedia() now takes (input_urls, revision, output_path) directly. Removes WikipediaDownloadConfig, draccus decorator, and CLI entry point. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/wikipedia.py | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index cfd919fae6..ec51c62b0b 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -2,8 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 """ -wikipedia/download.py - Download script for the Wikipedia raw HTML data, provided by Wikimedia. 
Home Page: https://dumps.wikimedia.org/other/enterprise_html/runs/ @@ -11,14 +9,14 @@ Example Usage (production, large dataset): ENWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/enwiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz uv run zephyr --backend=ray --max-parallelism=10 \ - lib/marin/src/marin/download/wikipedia/download.py \ + lib/marin/src/marin/datakit/download/wikipedia.py \ --input_urls $ENWIKI \ --revision 20250320 --output_path gs://path/to/output Example Usage (local testing, small dataset): SIMPLEWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/simplewiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz -uv run zephyr --backend=threadpool --max-parallelism=4 --entry-point=download \ - lib/marin/src/marin/download/wikipedia/download.py \ +uv run zephyr --backend=threadpool --max-parallelism=4 --entry-point=main \ + lib/marin/src/marin/datakit/download/wikipedia.py \ --input_urls "[$SIMPLEWIKI]" \ --revision 20250320 --output_path /tmp/wikipedia_test @@ -30,9 +28,7 @@ import os import tarfile from collections.abc import Iterable -from dataclasses import dataclass -import draccus import requests from iris.marin_fs import open_url from marin.utils import fsspec_size @@ -42,14 +38,7 @@ logger = logging.getLogger(__name__) -@dataclass -class WikipediaDownloadConfig: - input_urls: list[str] - revision: str - output_path: str - - -def download_tar(url: str, output_prefix) -> str: +def download_tar(url: str, output_prefix: str) -> str: shard_filename = url.split("/")[-1] output_filename = os.path.join(output_prefix, shard_filename) logger.info(f"Downloading URL: {url} to {output_filename}") @@ -100,15 +89,14 @@ def process_file(input_file: str, output_path: str) -> Iterable[str]: raise e -@draccus.wrap() -def download(cfg: WikipediaDownloadConfig) -> None: +def download_wikipedia(input_urls: list[str], revision: str, output_path: str) -> None: """Download and process Wikipedia data.""" logger.info("Starting transfer of Wikipedia dump...") - output_base = os.path.join(cfg.output_path, cfg.revision) + output_base = os.path.join(output_path, revision) ctx = ZephyrContext(name="download-wikipedia") download_metrics = ctx.execute( - Dataset.from_list(cfg.input_urls) + Dataset.from_list(input_urls) .map(lambda url: download_tar(url, output_base)) .write_jsonl(f"{output_base}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), ) From e7ac5bec38516d1a0dbe906050a587e4d606dda8 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:25:56 -0700 Subject: [PATCH 31/56] Remove unused draccus CLI from huggingface download module Not invoked anywhere in the codebase. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/huggingface.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/huggingface.py b/lib/marin/src/marin/datakit/download/huggingface.py index 6a6ff13cd2..fff532a017 100644 --- a/lib/marin/src/marin/datakit/download/huggingface.py +++ b/lib/marin/src/marin/datakit/download/huggingface.py @@ -14,7 +14,6 @@ import time from dataclasses import dataclass, field -import draccus import huggingface_hub from huggingface_hub import HfFileSystem from iris.marin_fs import open_url, url_to_fs @@ -397,13 +396,3 @@ def _run(output_path: str) -> None: output_path_prefix=output_path_prefix, override_output_path=override_output_path, ) - - -@draccus.wrap() -def main(cfg: DownloadConfig) -> None: - """Download HuggingFace dataset.""" - download_hf(cfg) - - -if __name__ == "__main__": - main() From 0c7587aa8c9ee40566f6975d5a93c42603b5f9a2 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:28:04 -0700 Subject: [PATCH 32/56] Remove backward-compat aliases from datakit/download/__init__.py Clean up the download_step alias and __all__. The one consumer (test_datakit.py) now imports download_hf_step directly. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/__init__.py | 16 ---------------- tests/datakit/test_datakit.py | 4 ++-- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/__init__.py b/lib/marin/src/marin/datakit/download/__init__.py index cc14fdbdf4..ec8bc038b7 100644 --- a/lib/marin/src/marin/datakit/download/__init__.py +++ b/lib/marin/src/marin/datakit/download/__init__.py @@ -1,18 +1,2 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 - -from marin.datakit.download.huggingface import ( - DownloadConfig, - download_hf, - download_hf_step, -) - -# Backward-compat alias: download_step was the original name in the single-file module. -download_step = download_hf_step - -__all__ = [ - "DownloadConfig", - "download_hf", - "download_hf_step", - "download_step", -] diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py index 1c29e35a9c..0c126e6074 100644 --- a/tests/datakit/test_datakit.py +++ b/tests/datakit/test_datakit.py @@ -10,7 +10,7 @@ import pytest from levanter.store.cache import CacheLedger, TreeCache -from marin.datakit.download import download_step +from marin.datakit.download.huggingface import download_hf_step from marin.datakit.normalize import content_hash_id, normalize_step from marin.datakit.tokenize import tokenize_step from marin.execution.step_runner import StepRunner @@ -20,7 +20,7 @@ def test_download_normalize_tokenize(tmp_path): """Download → normalize → tokenize as a StepSpec DAG via StepRunner.""" - dl = download_step( + dl = download_hf_step( "datakit/download", hf_dataset_id="wikitext", revision="main", From aa2252d81b26a841567fdc8c86c7e7fdb525c1a0 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:29:44 -0700 Subject: [PATCH 33/56] Remove output_path_prefix from download_hf_step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify the API — override_output_path with relative path support (auto-prefixed by marin_prefix) is sufficient. 
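For illustration, the intended call shape after this change (a hedged sketch: the gs:// bucket and the "raw/wikitext" value are made-up examples, and the marin_prefix resolution is assumed from the description above rather than shown here):

    from marin.datakit.download.huggingface import download_hf_step

    # absolute path: used verbatim as the step's output path
    dl = download_hf_step(
        "datakit/download",
        hf_dataset_id="wikitext",
        revision="main",
        override_output_path="gs://my-bucket/raw/wikitext",  # hypothetical bucket
    )

    # relative path: assumed to be auto-prefixed by marin_prefix at run time
    dl = download_hf_step(
        "datakit/download",
        hf_dataset_id="wikitext",
        revision="main",
        override_output_path="raw/wikitext",
    )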
Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/huggingface.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/huggingface.py b/lib/marin/src/marin/datakit/download/huggingface.py index fff532a017..c414df96a9 100644 --- a/lib/marin/src/marin/datakit/download/huggingface.py +++ b/lib/marin/src/marin/datakit/download/huggingface.py @@ -351,7 +351,6 @@ def download_hf_step( hf_urls_glob: list[str] | None = None, zephyr_max_parallelism: int = 8, deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, override_output_path: str | None = None, ) -> StepSpec: """Create a StepSpec that downloads a HuggingFace dataset. @@ -365,7 +364,6 @@ def download_hf_step( hf_urls_glob: Glob patterns to select specific files. Empty means all files. zephyr_max_parallelism: Maximum download parallelism. deps: Optional upstream dependencies. - output_path_prefix: Override the default output path prefix. override_output_path: Override the computed output path entirely. Returns: @@ -393,6 +391,5 @@ def _run(output_path: str) -> None: "revision": revision, "hf_urls_glob": resolved_glob, }, - output_path_prefix=output_path_prefix, override_output_path=override_output_path, ) From 55565769a0fc7b69caa088e60f7bd0e58448a94a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:42:44 -0700 Subject: [PATCH 34/56] Remove unused datakit/tokenize.py module Only consumer was the integration test, which now uses StepSpec with TokenizeConfig directly. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/tokenize.py | 71 ------------------------- tests/datakit/test_datakit.py | 19 +++++-- 2 files changed, 14 insertions(+), 76 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/tokenize.py diff --git a/lib/marin/src/marin/datakit/tokenize.py b/lib/marin/src/marin/datakit/tokenize.py deleted file mode 100644 index 0e5c9b4168..0000000000 --- a/lib/marin/src/marin/datakit/tokenize.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Datakit tokenize stage — convert normalized Parquet datasets into Levanter cache format. - -This is the final stage of the datakit pipeline. It reads normalized Parquet -files and produces tokenized training data in Levanter's TreeStore format. - -Tokenization is the boundary where per-document structure ends. The tokenizer -concatenates documents into fixed-size token sequences for efficient training. -""" - -import logging - -from marin.execution.step_spec import StepSpec -from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize - -logger = logging.getLogger(__name__) - - -def tokenize_step( - name: str, - *, - input_path: str, - tokenizer: str, - max_workers: int = 4096, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that tokenizes a normalized dataset. - - Reads normalized Parquet files and produces Levanter cache format output - suitable for training. - - Args: - name: Step name (e.g. "fineweb/tokenize"). - input_path: Path to normalized Parquet files (output of normalize step). - tokenizer: HuggingFace tokenizer name (e.g. "meta-llama/Llama-3.1-8B"). - max_workers: Maximum Zephyr worker parallelism. - deps: Upstream dependencies (typically the normalize or consolidate step). - output_path_prefix: Override the default output path prefix. 
- override_output_path: Override the computed output path entirely. - - Returns: - A StepSpec whose output_path contains the tokenized Levanter cache. - """ - - def _run(output_path: str) -> None: - tokenize( - TokenizeConfig( - train_paths=[input_path], - validation_paths=[], - cache_path=output_path, - tokenizer=tokenizer, - max_workers=max_workers, - allow_test_in_train=True, - ) - ) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={ - "input_path": input_path, - "tokenizer": tokenizer, - }, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py index 0c126e6074..184b0c6230 100644 --- a/tests/datakit/test_datakit.py +++ b/tests/datakit/test_datakit.py @@ -12,8 +12,9 @@ from marin.datakit.download.huggingface import download_hf_step from marin.datakit.normalize import content_hash_id, normalize_step -from marin.datakit.tokenize import tokenize_step from marin.execution.step_runner import StepRunner +from marin.execution.step_spec import StepSpec +from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize @pytest.mark.slow @@ -35,11 +36,19 @@ def test_download_normalize_tokenize(tmp_path): override_output_path=str(tmp_path / "normalized"), ) - tok = tokenize_step( - "datakit/tokenize", - input_path=norm.output_path, - tokenizer="gpt2", + tok = StepSpec( + name="datakit/tokenize", + fn=lambda output_path: tokenize( + TokenizeConfig( + train_paths=[norm.output_path], + validation_paths=[], + cache_path=output_path, + tokenizer="gpt2", + allow_test_in_train=True, + ) + ), deps=[norm], + hash_attrs={"tokenizer": "gpt2"}, override_output_path=str(tmp_path / "tokenized"), ) From f2983ba0114fae42199d23b9412d5af8e3c0d7b7 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:44:52 -0700 Subject: [PATCH 35/56] Remove unused datakit/normalize.py module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No production consumers. Simplify integration test to download → tokenize only. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/normalize.py | 194 ----------------------- tests/datakit/test_datakit.py | 36 +---- 2 files changed, 6 insertions(+), 224 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/normalize.py diff --git a/lib/marin/src/marin/datakit/normalize.py b/lib/marin/src/marin/datakit/normalize.py deleted file mode 100644 index bace847696..0000000000 --- a/lib/marin/src/marin/datakit/normalize.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Datakit normalize stage — convert raw data into the datakit standard Parquet format. - -The normalize step is the "intake" for the datakit pipeline. It reads raw files -(JSONL, Parquet, or other formats supported by Zephyr), enforces a standard -schema (mandatory ``id`` and ``text`` columns), and writes co-partitioned, -sorted Parquet files. - -Key guarantees after normalization: -- Every record has a deterministic ``id`` (SHA-256 of the text content). -- If the source data has an existing ID field, it is preserved as ``source_id``. -- Text is present and UTF-8 encoded. -- Each output partition is sorted by ``id``. -- Output files follow the ``part-{shard:05d}-of-{total:05d}.parquet`` naming convention. 
-""" - -import hashlib -import logging -import os -from collections.abc import Iterator - -from marin.execution.artifact import PathsMetadata -from marin.execution.step_spec import StepSpec -from marin.utils import fsspec_glob -from zephyr import Dataset, ShardInfo, ZephyrContext -from zephyr.readers import load_file - -logger = logging.getLogger(__name__) - -DEFAULT_TEXT_FIELD = "text" - - -def content_hash_id(text: str) -> str: - """Generate a deterministic document ID from text content. - - Uses SHA-256 truncated to 16 hex characters for a compact but - collision-resistant identifier. - """ - return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16] - - -def _discover_input_files(input_path: str) -> list[str]: - """Find all supported input files under input_path, excluding dotfiles/directories.""" - extensions = ["jsonl.gz", "jsonl.zst", "jsonl.zstd", "jsonl", "parquet", "vortex"] - files: list[str] = [] - for ext in extensions: - files.extend(fsspec_glob(os.path.join(input_path, f"**/*.{ext}"))) - # Exclude hidden directories (e.g. .metrics/ written by download_hf) - files = [f for f in files if "/." not in f.split(input_path, 1)[-1]] - if not files: - raise ValueError(f"No supported input files found under {input_path}") - return sorted(files) - - -def _normalize_record(record: dict, text_field: str, source_id_field: str | None) -> dict: - """Transform a single record into datakit standard format. - - - Extracts and renames the text field to ``text``. - - Generates a deterministic ``id`` from the text content. - - Preserves the original ID (if any) as ``source_id``. - - Preserves all other fields. - """ - text = record.get(text_field) - if text is None: - raise ValueError(f"Record missing required text field {text_field!r}: {list(record.keys())}") - if not isinstance(text, str): - text = str(text) - - doc_id = content_hash_id(text) - - normalized: dict = {"id": doc_id, "text": text} - - if source_id_field is not None and source_id_field in record: - normalized["source_id"] = str(record[source_id_field]) - - # Preserve additional columns - skip_fields = {text_field, source_id_field} if source_id_field else {text_field} - for key, value in record.items(): - if key not in skip_fields and key not in normalized: - normalized[key] = value - - return normalized - - -def normalize( - input_path: str, - output_path: str, - *, - text_field: str = DEFAULT_TEXT_FIELD, - source_id_field: str | None = None, - num_output_shards: int | None = None, - zephyr_max_workers: int = 64, -) -> PathsMetadata: - """Run the normalize pipeline. - - Reads raw files, transforms each record to the standard schema, - repartitions by ``id`` (hash-based), deduplicates, sorts each partition - by ``id``, and writes Parquet output files. - - Args: - input_path: Path to raw input files. - output_path: Directory to write output Parquet files. - text_field: Name of the field containing the primary text content. - source_id_field: Name of an existing ID field to preserve as ``source_id``. - num_output_shards: Number of output Parquet partitions. Defaults to - the number of input files. - zephyr_max_workers: Maximum Zephyr worker parallelism. - - Returns: - PathsMetadata listing the output files. 
- """ - input_files = _discover_input_files(input_path) - logger.info("Normalizing %d input files from %s", len(input_files), input_path) - - shards = num_output_shards or len(input_files) - - def _sort_shard(records: Iterator[dict], _shard_info: ShardInfo) -> Iterator[dict]: - batch = list(records) - batch.sort(key=lambda r: r["id"]) - return iter(batch) - - output_pattern = os.path.join(output_path, "part-{shard:05d}-of-{total:05d}.parquet") - pipeline = ( - Dataset.from_list(input_files) - .flat_map(load_file) - .map(lambda r: _normalize_record(r, text_field, source_id_field)) - .group_by( - key=lambda r: r["id"], - reducer=lambda _key, records: next(iter(records)), - num_output_shards=shards, - ) - .map_shard(_sort_shard) - .write_parquet(output_pattern) - ) - - ctx = ZephyrContext(name="datakit-normalize", max_workers=min(zephyr_max_workers, shards)) - output_files = list(ctx.execute(pipeline)) - logger.info("Wrote %d normalized Parquet partitions to %s", len(output_files), output_path) - return PathsMetadata(parent_path=output_path, paths=output_files) - - -def normalize_step( - name: str, - *, - input_path: str, - text_field: str = DEFAULT_TEXT_FIELD, - source_id_field: str | None = None, - num_output_shards: int | None = None, - zephyr_max_workers: int = 64, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec for the normalize stage. - - Args: - name: Step name (e.g. "fineweb/normalize"). - input_path: Path to raw input files. - text_field: Name of the field containing the primary text content. - source_id_field: Name of an existing ID field to preserve as ``source_id``. - num_output_shards: Number of output Parquet partitions. - zephyr_max_workers: Maximum Zephyr worker parallelism. - deps: Upstream dependencies (typically the download step). - output_path_prefix: Override the default output path prefix. - override_output_path: Override the computed output path entirely. - - Returns: - A StepSpec whose output_path contains normalized Parquet files. 
- """ - - def _run(step_output_path: str) -> PathsMetadata: - return normalize( - input_path, - step_output_path, - text_field=text_field, - source_id_field=source_id_field, - num_output_shards=num_output_shards, - zephyr_max_workers=zephyr_max_workers, - ) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={ - "input_path": input_path, - "text_field": text_field, - "source_id_field": source_id_field, - }, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py index 184b0c6230..8a9286763b 100644 --- a/tests/datakit/test_datakit.py +++ b/tests/datakit/test_datakit.py @@ -1,25 +1,23 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -"""Integration test for the datakit pipeline: download → normalize → tokenize, wired as StepSpecs.""" +"""Integration test for the datakit pipeline: download → tokenize, wired as StepSpecs.""" from pathlib import Path import numpy as np -import pyarrow.parquet as pq import pytest from levanter.store.cache import CacheLedger, TreeCache from marin.datakit.download.huggingface import download_hf_step -from marin.datakit.normalize import content_hash_id, normalize_step from marin.execution.step_runner import StepRunner from marin.execution.step_spec import StepSpec from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize @pytest.mark.slow -def test_download_normalize_tokenize(tmp_path): - """Download → normalize → tokenize as a StepSpec DAG via StepRunner.""" +def test_download_and_tokenize(tmp_path): + """Download → tokenize as a StepSpec DAG via StepRunner.""" dl = download_hf_step( "datakit/download", @@ -29,50 +27,28 @@ def test_download_normalize_tokenize(tmp_path): override_output_path=str(tmp_path / "raw"), ) - norm = normalize_step( - "datakit/normalize", - input_path=dl.output_path, - deps=[dl], - override_output_path=str(tmp_path / "normalized"), - ) - tok = StepSpec( name="datakit/tokenize", fn=lambda output_path: tokenize( TokenizeConfig( - train_paths=[norm.output_path], + train_paths=[dl.output_path], validation_paths=[], cache_path=output_path, tokenizer="gpt2", allow_test_in_train=True, ) ), - deps=[norm], + deps=[dl], hash_attrs={"tokenizer": "gpt2"}, override_output_path=str(tmp_path / "tokenized"), ) - StepRunner().run([dl, norm, tok]) + StepRunner().run([dl, tok]) # -- Verify download output -- raw_files = [f for f in Path(dl.output_path).rglob("*") if f.is_file() and not f.name.startswith(".")] assert len(raw_files) >= 1 - # -- Verify normalize output -- - parquet_files = sorted(Path(norm.output_path).glob("*.parquet")) - assert len(parquet_files) >= 1 - - all_records = [] - for pf in parquet_files: - records = pq.read_table(str(pf)).to_pylist() - all_records.extend(records) - ids = [r["id"] for r in records] - assert ids == sorted(ids), f"Partition {pf.name} not sorted by id" - - assert len(all_records) > 0 - for record in all_records: - assert record["id"] == content_hash_id(record["text"]) - # -- Verify tokenize output -- train_dir = Path(tok.output_path) / "train" ledger = CacheLedger.load(str(train_dir)) From 764e117823fce6c2f68584d852e339c6e1b5832b Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:50:18 -0700 Subject: [PATCH 36/56] Move tests/download/ to tests/datakit/download/ Mirrors the source code location at marin.datakit.download.*. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/datakit/{ => download}/__init__.py | 0 tests/{ => datakit}/download/conftest.py | 0 tests/{ => datakit}/download/test_ar5iv.py | 0 tests/{ => datakit}/download/test_dclm_hq.py | 0 .../download/test_huggingface.py | 0 .../download/test_nemotron_cc.py | 0 tests/datakit/test_datakit.py | 61 ------------------- 7 files changed, 61 deletions(-) rename tests/datakit/{ => download}/__init__.py (100%) rename tests/{ => datakit}/download/conftest.py (100%) rename tests/{ => datakit}/download/test_ar5iv.py (100%) rename tests/{ => datakit}/download/test_dclm_hq.py (100%) rename tests/{ => datakit}/download/test_huggingface.py (100%) rename tests/{ => datakit}/download/test_nemotron_cc.py (100%) delete mode 100644 tests/datakit/test_datakit.py diff --git a/tests/datakit/__init__.py b/tests/datakit/download/__init__.py similarity index 100% rename from tests/datakit/__init__.py rename to tests/datakit/download/__init__.py diff --git a/tests/download/conftest.py b/tests/datakit/download/conftest.py similarity index 100% rename from tests/download/conftest.py rename to tests/datakit/download/conftest.py diff --git a/tests/download/test_ar5iv.py b/tests/datakit/download/test_ar5iv.py similarity index 100% rename from tests/download/test_ar5iv.py rename to tests/datakit/download/test_ar5iv.py diff --git a/tests/download/test_dclm_hq.py b/tests/datakit/download/test_dclm_hq.py similarity index 100% rename from tests/download/test_dclm_hq.py rename to tests/datakit/download/test_dclm_hq.py diff --git a/tests/download/test_huggingface.py b/tests/datakit/download/test_huggingface.py similarity index 100% rename from tests/download/test_huggingface.py rename to tests/datakit/download/test_huggingface.py diff --git a/tests/download/test_nemotron_cc.py b/tests/datakit/download/test_nemotron_cc.py similarity index 100% rename from tests/download/test_nemotron_cc.py rename to tests/datakit/download/test_nemotron_cc.py diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py deleted file mode 100644 index 8a9286763b..0000000000 --- a/tests/datakit/test_datakit.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Integration test for the datakit pipeline: download → tokenize, wired as StepSpecs.""" - -from pathlib import Path - -import numpy as np -import pytest -from levanter.store.cache import CacheLedger, TreeCache - -from marin.datakit.download.huggingface import download_hf_step -from marin.execution.step_runner import StepRunner -from marin.execution.step_spec import StepSpec -from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize - - -@pytest.mark.slow -def test_download_and_tokenize(tmp_path): - """Download → tokenize as a StepSpec DAG via StepRunner.""" - - dl = download_hf_step( - "datakit/download", - hf_dataset_id="wikitext", - revision="main", - hf_urls_glob=["wikitext-2-v1/test-*.parquet"], - override_output_path=str(tmp_path / "raw"), - ) - - tok = StepSpec( - name="datakit/tokenize", - fn=lambda output_path: tokenize( - TokenizeConfig( - train_paths=[dl.output_path], - validation_paths=[], - cache_path=output_path, - tokenizer="gpt2", - allow_test_in_train=True, - ) - ), - deps=[dl], - hash_attrs={"tokenizer": "gpt2"}, - override_output_path=str(tmp_path / "tokenized"), - ) - - StepRunner().run([dl, tok]) - - # -- Verify download output -- - raw_files = [f for f in Path(dl.output_path).rglob("*") if f.is_file() and not f.name.startswith(".")] - assert 
len(raw_files) >= 1 - - # -- Verify tokenize output -- - train_dir = Path(tok.output_path) / "train" - ledger = CacheLedger.load(str(train_dir)) - assert ledger.is_finished - assert ledger.total_num_rows > 0 - - exemplar = {"input_ids": np.array([0], dtype=np.int32)} - cache = TreeCache.load(str(train_dir), exemplar=exemplar) - assert len(cache) == ledger.total_num_rows - assert len(cache[0]["input_ids"]) > 0 From 05d58e29104cbff8ea18ff7da4552d53b0d52ef0 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:50:59 -0700 Subject: [PATCH 37/56] Restore tests/datakit/__init__.py and test_datakit.py The previous commit accidentally removed these when moving the download tests. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/datakit/__init__.py | 2 ++ tests/datakit/test_datakit.py | 61 +++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 tests/datakit/__init__.py create mode 100644 tests/datakit/test_datakit.py diff --git a/tests/datakit/__init__.py b/tests/datakit/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/tests/datakit/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py new file mode 100644 index 0000000000..8a9286763b --- /dev/null +++ b/tests/datakit/test_datakit.py @@ -0,0 +1,61 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Integration test for the datakit pipeline: download → tokenize, wired as StepSpecs.""" + +from pathlib import Path + +import numpy as np +import pytest +from levanter.store.cache import CacheLedger, TreeCache + +from marin.datakit.download.huggingface import download_hf_step +from marin.execution.step_runner import StepRunner +from marin.execution.step_spec import StepSpec +from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize + + +@pytest.mark.slow +def test_download_and_tokenize(tmp_path): + """Download → tokenize as a StepSpec DAG via StepRunner.""" + + dl = download_hf_step( + "datakit/download", + hf_dataset_id="wikitext", + revision="main", + hf_urls_glob=["wikitext-2-v1/test-*.parquet"], + override_output_path=str(tmp_path / "raw"), + ) + + tok = StepSpec( + name="datakit/tokenize", + fn=lambda output_path: tokenize( + TokenizeConfig( + train_paths=[dl.output_path], + validation_paths=[], + cache_path=output_path, + tokenizer="gpt2", + allow_test_in_train=True, + ) + ), + deps=[dl], + hash_attrs={"tokenizer": "gpt2"}, + override_output_path=str(tmp_path / "tokenized"), + ) + + StepRunner().run([dl, tok]) + + # -- Verify download output -- + raw_files = [f for f in Path(dl.output_path).rglob("*") if f.is_file() and not f.name.startswith(".")] + assert len(raw_files) >= 1 + + # -- Verify tokenize output -- + train_dir = Path(tok.output_path) / "train" + ledger = CacheLedger.load(str(train_dir)) + assert ledger.is_finished + assert ledger.total_num_rows > 0 + + exemplar = {"input_ids": np.array([0], dtype=np.int32)} + cache = TreeCache.load(str(train_dir), exemplar=exemplar) + assert len(cache) == ledger.total_num_rows + assert len(cache[0]["input_ids"]) > 0 From bed10156a0356c1b7c90bdd75ca1680795a2e758 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:07:26 -0700 Subject: [PATCH 38/56] Replace nemotron downloads dict with nemotron_cc_download variable The single-entry dict was unnecessary indirection. All consumers updated to reference the variable directly. 
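Sketch of the consumer-side change (imports copied from the diff below; surrounding code elided):

    # before
    from experiments.pretraining_datasets.nemotron import downloads as nemotron_downloads
    nemotron_cc = nemotron_downloads["nemotron_cc"]

    # after
    from experiments.pretraining_datasets.nemotron import nemotron_cc_download
    nemotron_cc = nemotron_cc_download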
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/__init__.py | 4 ++-- experiments/pretraining_datasets/nemotron.py | 7 ++----- experiments/train_test_overlap/train_test_total.py | 4 ++-- lib/marin/src/marin/datakit/download/nemotron_v1.py | 6 ++++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/experiments/pretraining_datasets/__init__.py b/experiments/pretraining_datasets/__init__.py index 6ca2bff80f..79d651252b 100644 --- a/experiments/pretraining_datasets/__init__.py +++ b/experiments/pretraining_datasets/__init__.py @@ -37,7 +37,7 @@ NEMOTRON_DATASETS, NEMOTRON_LLAMA3_OVERRIDES, NEMOTRON_WEIGHTS, - downloads as nemotron_downloads, + nemotron_cc_download, nemotron_mix, nemotron_mix_block_shuffle, tokenize_nemotron, @@ -119,7 +119,7 @@ }, "nemotron_cc": { "subsets": list(NEMOTRON_DATASETS.keys()), - "download": nemotron_downloads["nemotron_cc"], + "download": nemotron_cc_download, "tokenize_fn": tokenize_nemotron, }, "dolma": { diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index 22fc4d1efa..b4211bd5aa 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -13,12 +13,9 @@ from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep -# Raw dataset download step -downloads = { - "nemotron_cc": download_nemotron_v1_step("raw/nemotro-cc").as_executor_step(), -} +nemotron_cc_download = download_nemotron_v1_step().as_executor_step() -_nemotron_cc_path = output_path_of(downloads["nemotron_cc"], "contrib/Nemotron/Nemotron-CC/data-jsonl/") +_nemotron_cc_path = output_path_of(nemotron_cc_download, "contrib/Nemotron/Nemotron-CC/data-jsonl/") NEMOTRON_DATASETS = { "hq_actual": ["quality=high/kind=actual/**/*.jsonl.*"], diff --git a/experiments/train_test_overlap/train_test_total.py b/experiments/train_test_overlap/train_test_total.py index e08dbfb4f2..92387dd61d 100644 --- a/experiments/train_test_overlap/train_test_total.py +++ b/experiments/train_test_overlap/train_test_total.py @@ -37,7 +37,7 @@ from experiments.midtraining_datasets import finemath_3_plus from experiments.pretraining_datasets.simple import downloads from experiments.pretraining_datasets.dolmino import downloads as dolmino_downloads -from experiments.pretraining_datasets.nemotron import downloads as nemotron_downloads +from experiments.pretraining_datasets.nemotron import nemotron_cc_download from experiments.train_test_overlap.eval_datasets_overlap import EVAL_DATASET_STEPS logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") @@ -81,7 +81,7 @@ def run_train_test_overlap(config: DeconConfig) -> str: DatasetConfig(name="starcoder", path=downloads["starcoderdata"], text_field="content"), DatasetConfig(name="proofpile", path=downloads["proofpile_2"]), DatasetConfig(name="dolmino", path=dolmino_downloads["dolmino"]), - DatasetConfig(name="nemotron_cc", path=nemotron_downloads["nemotron_cc"]), + DatasetConfig(name="nemotron_cc", path=nemotron_cc_download), ] diff --git a/lib/marin/src/marin/datakit/download/nemotron_v1.py b/lib/marin/src/marin/datakit/download/nemotron_v1.py index 0befbf1883..27a267b38d 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v1.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v1.py @@ -108,10 +108,12 @@ def download_nemotron_cc(output_path: str) -> None: logger.info(f"Downloaded Nemotron CC files to 
{output_path}") -def download_nemotron_v1_step(name: str = "raw/nemotron-cc") -> StepSpec: +def download_nemotron_v1_step() -> StepSpec: """Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl.""" return StepSpec( - name=name, + name="raw/nemotron_v1", fn=lambda output_path: download_nemotron_cc(output_path=output_path), + # NOTE: use the existing output to avoid re-downloading. Yes this is mssing the `n`. + override_output_path="raw/nemotro-cc-eeb783", ) From debf1fe0e1821442d8fd103586c850a0a4c1e79c Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:09:46 -0700 Subject: [PATCH 39/56] Replace nemotron_cc_download global with a function Inline _nemotron_cc_path into its only caller. All consumers now call nemotron_cc_download() instead of referencing a global. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/__init__.py | 2 +- experiments/pretraining_datasets/nemotron.py | 9 +++++---- experiments/train_test_overlap/train_test_total.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/experiments/pretraining_datasets/__init__.py b/experiments/pretraining_datasets/__init__.py index 79d651252b..93e9ffddf0 100644 --- a/experiments/pretraining_datasets/__init__.py +++ b/experiments/pretraining_datasets/__init__.py @@ -119,7 +119,7 @@ }, "nemotron_cc": { "subsets": list(NEMOTRON_DATASETS.keys()), - "download": nemotron_cc_download, + "download": nemotron_cc_download(), "tokenize_fn": tokenize_nemotron, }, "dolma": { diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index b4211bd5aa..d822c4dd5b 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -13,9 +13,10 @@ from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep -nemotron_cc_download = download_nemotron_v1_step().as_executor_step() -_nemotron_cc_path = output_path_of(nemotron_cc_download, "contrib/Nemotron/Nemotron-CC/data-jsonl/") +def nemotron_cc_download() -> ExecutorStep: + return download_nemotron_v1_step().as_executor_step() + NEMOTRON_DATASETS = { "hq_actual": ["quality=high/kind=actual/**/*.jsonl.*"], @@ -52,8 +53,8 @@ def _get_nemotron_split_paths(split: str): """Helper to get file paths for a nemotron split.""" - patterns = NEMOTRON_DATASETS[split] - return [_nemotron_cc_path / pattern for pattern in patterns] + base = output_path_of(nemotron_cc_download(), "contrib/Nemotron/Nemotron-CC/data-jsonl/") + return [base / pattern for pattern in NEMOTRON_DATASETS[split]] def tokenize_nemotron( diff --git a/experiments/train_test_overlap/train_test_total.py b/experiments/train_test_overlap/train_test_total.py index 92387dd61d..af280c552b 100644 --- a/experiments/train_test_overlap/train_test_total.py +++ b/experiments/train_test_overlap/train_test_total.py @@ -81,7 +81,7 @@ def run_train_test_overlap(config: DeconConfig) -> str: DatasetConfig(name="starcoder", path=downloads["starcoderdata"], text_field="content"), DatasetConfig(name="proofpile", path=downloads["proofpile_2"]), DatasetConfig(name="dolmino", path=dolmino_downloads["dolmino"]), - DatasetConfig(name="nemotron_cc", path=nemotron_cc_download), + DatasetConfig(name="nemotron_cc", path=nemotron_cc_download()), ] From 6ba73d5bdd57c6801b2d95d2c392226960c5e90a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:10:29 -0700 Subject: [PATCH 40/56] Fix typo in 
nemotron_v1 comment Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/nemotron_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/marin/src/marin/datakit/download/nemotron_v1.py b/lib/marin/src/marin/datakit/download/nemotron_v1.py index 27a267b38d..3a4f9a0a98 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v1.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v1.py @@ -114,6 +114,6 @@ def download_nemotron_v1_step() -> StepSpec: return StepSpec( name="raw/nemotron_v1", fn=lambda output_path: download_nemotron_cc(output_path=output_path), - # NOTE: use the existing output to avoid re-downloading. Yes this is mssing the `n`. + # NOTE: use the existing output to avoid re-downloading. Yes this is missing the `n`. override_output_path="raw/nemotro-cc-eeb783", ) From 4b86e369fde6c8bf97ec533d857bf753667b2f61 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:17:01 -0700 Subject: [PATCH 41/56] Rename huggingface.py to huggingface_utils.py, update all imports Updates 18 files with import paths and mock targets to reflect the rename from huggingface to huggingface_utils. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/common_pile/tokenize_common_pile.py | 2 +- experiments/defaults.py | 2 +- experiments/eval_datasets.py | 2 +- experiments/midtraining_datasets.py | 2 +- experiments/models.py | 2 +- .../download_and_tokenize_fineweb2_hq.py | 2 +- experiments/paloma.py | 2 +- experiments/posttrain/preference_datasets.py | 2 +- experiments/pretraining_datasets/dolma.py | 2 +- experiments/pretraining_datasets/dolmino.py | 2 +- experiments/pretraining_datasets/simple.py | 2 +- .../train_test_overlap/eval_datasets_overlap.py | 2 +- experiments/two_stage/data.py | 2 +- .../download/{huggingface.py => huggingface_utils.py} | 0 lib/marin/src/marin/datakit/download/nemotron_v2.py | 8 +++++++- .../marin/processing/tokenize/download_pretokenized.py | 2 +- lib/marin/src/marin/speedrun/paloma_local_download.py | 4 ++-- tests/datakit/download/test_huggingface.py | 10 +++++----- tests/datakit/test_datakit.py | 2 +- tests/test_hfdataset_spec.py | 2 +- 20 files changed, 30 insertions(+), 24 deletions(-) rename lib/marin/src/marin/datakit/download/{huggingface.py => huggingface_utils.py} (100%) diff --git a/experiments/common_pile/tokenize_common_pile.py b/experiments/common_pile/tokenize_common_pile.py index faee07fc76..1ec5b2f86a 100644 --- a/experiments/common_pile/tokenize_common_pile.py +++ b/experiments/common_pile/tokenize_common_pile.py @@ -5,7 +5,7 @@ from experiments.defaults import default_tokenize from experiments.llama import llama3_tokenizer -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path from marin.processing.tokenize.data_configs import TokenizerStep, lm_mixture_data_config diff --git a/experiments/defaults.py b/experiments/defaults.py index 01e9583442..ef1e9ad892 100644 --- a/experiments/defaults.py +++ b/experiments/defaults.py @@ -46,7 +46,7 @@ from experiments.simple_sft_config import SimpleSFTConfig from experiments.simple_train_config import SimpleTrainConfig from levanter.utils.mesh import MeshConfig -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.evaluation.evaluation_config 
import EvalTaskConfig from marin.execution.executor import ( ExecutorStep, diff --git a/experiments/eval_datasets.py b/experiments/eval_datasets.py index f55df8b3fc..db6e8f8f54 100644 --- a/experiments/eval_datasets.py +++ b/experiments/eval_datasets.py @@ -3,7 +3,7 @@ import dataclasses -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/midtraining_datasets.py b/experiments/midtraining_datasets.py index 2706f8a4e9..b30e57dc67 100644 --- a/experiments/midtraining_datasets.py +++ b/experiments/midtraining_datasets.py @@ -4,7 +4,7 @@ from experiments.common_pile.tokenize_common_pile import stackv2_edu_filtered from experiments.defaults import default_download, default_tokenize from experiments.llama import llama3_tokenizer -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution import versioned from marin.execution.executor import ExecutorStep, this_output_path from marin.processing.tokenize import lm_mixture_data_config diff --git a/experiments/models.py b/experiments/models.py index 972ca4f753..1afb7bb907 100644 --- a/experiments/models.py +++ b/experiments/models.py @@ -18,7 +18,7 @@ from dataclasses import dataclass -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.utils import get_directory_friendly_name diff --git a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py index a3fd2ae82a..db4a6fbb4b 100644 --- a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py +++ b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py @@ -13,7 +13,7 @@ from experiments.llama import llama3_tokenizer from experiments.multilingual_fineweb2_hq.constants import FINEWEB2_DATASETS -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/paloma.py b/experiments/paloma.py index 24c1a536df..e354e31d54 100644 --- a/experiments/paloma.py +++ b/experiments/paloma.py @@ -9,7 +9,7 @@ import os.path -from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig as HfDownloadConfig, download_hf # cyclic dependency # from experiments.llama import llama3_tokenizer diff --git a/experiments/posttrain/preference_datasets.py b/experiments/posttrain/preference_datasets.py index 105722d2af..9ea785000c 100644 --- a/experiments/posttrain/preference_datasets.py +++ b/experiments/posttrain/preference_datasets.py @@ -22,7 +22,7 @@ from collections.abc import Sequence from 
dataclasses import dataclass, field -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ( ExecutorStep, executor_main, diff --git a/experiments/pretraining_datasets/dolma.py b/experiments/pretraining_datasets/dolma.py index 02b62df0aa..51604389c0 100644 --- a/experiments/pretraining_datasets/dolma.py +++ b/experiments/pretraining_datasets/dolma.py @@ -10,7 +10,7 @@ import os.path -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned, InputName from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/dolmino.py b/experiments/pretraining_datasets/dolmino.py index 25dab84f52..0d8eb18a60 100644 --- a/experiments/pretraining_datasets/dolmino.py +++ b/experiments/pretraining_datasets/dolmino.py @@ -5,7 +5,7 @@ import os.path -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py index 5fa9a5fa65..b4ab925bae 100644 --- a/experiments/pretraining_datasets/simple.py +++ b/experiments/pretraining_datasets/simple.py @@ -12,7 +12,7 @@ from levanter.data.text import TextLmDatasetFormat from levanter.store.cache import CacheOptions -from marin.datakit.download.huggingface import download_hf_step +from marin.datakit.download.huggingface_utils import download_hf_step from marin.execution.executor import ExecutorStep, InputName, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize diff --git a/experiments/train_test_overlap/eval_datasets_overlap.py b/experiments/train_test_overlap/eval_datasets_overlap.py index b7df8679aa..f547aa3170 100644 --- a/experiments/train_test_overlap/eval_datasets_overlap.py +++ b/experiments/train_test_overlap/eval_datasets_overlap.py @@ -1,7 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/two_stage/data.py b/experiments/two_stage/data.py index c78daf0ab1..3493638097 100644 --- a/experiments/two_stage/data.py +++ b/experiments/two_stage/data.py @@ -6,7 +6,7 @@ from experiments.midtraining_datasets import finemath_3_plus_tokenized from experiments.pretraining_datasets import tokenize_dolma from experiments.pretraining_datasets.simple import tokenized -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path 
dolma_components = tokenize_dolma() diff --git a/lib/marin/src/marin/datakit/download/huggingface.py b/lib/marin/src/marin/datakit/download/huggingface_utils.py similarity index 100% rename from lib/marin/src/marin/datakit/download/huggingface.py rename to lib/marin/src/marin/datakit/download/huggingface_utils.py diff --git a/lib/marin/src/marin/datakit/download/nemotron_v2.py b/lib/marin/src/marin/datakit/download/nemotron_v2.py index 4c31f81ffa..24f074a92a 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v2.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v2.py @@ -12,7 +12,7 @@ from dataclasses import dataclass, field -from marin.datakit.download.huggingface import download_hf_step +from marin.datakit.download.huggingface_utils import download_hf_step from marin.execution.step_spec import StepSpec @@ -24,6 +24,7 @@ class NemotronV2Dataset: revision: str subsets: dict[str, str] = field(default_factory=dict) """Maps subset_name -> glob pattern for parquet files within the download.""" + override_output_path: str | None = None NEMOTRON_V2_DATASETS: dict[str, NemotronV2Dataset] = { @@ -38,6 +39,7 @@ class NemotronV2Dataset: "medium_quality": "Medium-Quality/**/*.parquet", "translated_diverse_qa": "Translated-Diverse-QA/**/*.parquet", }, + override_output_path="raw/nemotron_cc_v2-674913", ), "nemotron_cc_v2_1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-CC-v2.1", @@ -53,11 +55,13 @@ class NemotronV2Dataset: "medium_high_quality_translated": "Medium-High-Quality-Translated-To-English/**/*.parquet", "medium_quality": "Medium-Quality/**/*.parquet", }, + override_output_path="raw/nemotron_cc_v2_1-a7afb6", ), "nemotron_cc_code_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-CC-Code-v1", revision="5c5bebc", subsets={"all": "data/**/*.parquet"}, + override_output_path="raw/nemotron_cc_code_v1-c55cd9", ), "nemotron_cc_math_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-CC-Math-v1", @@ -67,6 +71,7 @@ class NemotronV2Dataset: "4plus": "4plus/**/*.parquet", "4plus_mind": "4plus_MIND/**/*.parquet", }, + override_output_path="nemotron_cc_math_v1-322fe4", ), "nemotron_pretraining_code_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v1", @@ -75,6 +80,7 @@ class NemotronV2Dataset: "synthetic_code": "Synthetic-Code/**/*.parquet", "code_metadata": "Nemotron-Code-Metadata/**/*.parquet", }, + override_output_path="raw/nemotron_pretraining_code_v1-175b37", ), "nemotron_pretraining_code_v2": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v2", diff --git a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py index cab2433bec..f7a30d4c25 100644 --- a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py +++ b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py @@ -18,7 +18,7 @@ ) from levanter.store.cache import CacheOptions -from marin.datakit.download.huggingface import ( +from marin.datakit.download.huggingface_utils import ( DownloadConfig as HfDownloadConfig, download_hf as hf_download_logic, ) diff --git a/lib/marin/src/marin/speedrun/paloma_local_download.py b/lib/marin/src/marin/speedrun/paloma_local_download.py index e2ee68f766..dd0031481d 100644 --- a/lib/marin/src/marin/speedrun/paloma_local_download.py +++ b/lib/marin/src/marin/speedrun/paloma_local_download.py @@ -8,8 +8,8 @@ """ from experiments.paloma import paloma_tokenized -from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig -from 
marin.datakit.download.huggingface import download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig as HfDownloadConfig +from marin.datakit.download.huggingface_utils import download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned llama3_tokenizer = "meta-llama/Meta-Llama-3.1-8B" diff --git a/tests/datakit/download/test_huggingface.py b/tests/datakit/download/test_huggingface.py index f055cc94ca..4626bd498f 100644 --- a/tests/datakit/download/test_huggingface.py +++ b/tests/datakit/download/test_huggingface.py @@ -9,7 +9,7 @@ import pytest -from marin.datakit.download.huggingface import ( +from marin.datakit.download.huggingface_utils import ( DownloadConfig, _relative_path_in_source, download_hf, @@ -76,7 +76,7 @@ def test_download_hf_basic(mock_hf_fs, tmp_path): ) # Mock HfFileSystem creation - with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs): download_hf(cfg) # Verify files were downloaded @@ -118,7 +118,7 @@ def test_download_hf_appends_sha_when_configured(mock_hf_fs, tmp_path): append_sha_to_path=True, ) - with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs): download_hf(cfg) target_output = base_output_path / revision @@ -180,8 +180,8 @@ def read(self, chunk_size): hf_fs.open.side_effect = lambda path, mode="rb", **_kwargs: FlakyReader() with ( - patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs), - patch("marin.datakit.download.huggingface.time.sleep", return_value=None), + patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs), + patch("marin.datakit.download.huggingface_utils.time.sleep", return_value=None), ): result = stream_file_to_fsspec( str(output_path), diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py index 8a9286763b..d2b3578020 100644 --- a/tests/datakit/test_datakit.py +++ b/tests/datakit/test_datakit.py @@ -9,7 +9,7 @@ import pytest from levanter.store.cache import CacheLedger, TreeCache -from marin.datakit.download.huggingface import download_hf_step +from marin.datakit.download.huggingface_utils import download_hf_step from marin.execution.step_runner import StepRunner from marin.execution.step_spec import StepSpec from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize diff --git a/tests/test_hfdataset_spec.py b/tests/test_hfdataset_spec.py index 14ad782471..ef6d2bd264 100644 --- a/tests/test_hfdataset_spec.py +++ b/tests/test_hfdataset_spec.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from experiments.defaults import default_download, default_tokenize -from marin.datakit.download.huggingface import DownloadConfig +from marin.datakit.download.huggingface_utils import DownloadConfig from marin.processing.tokenize import HfDatasetSpec from marin.processing.tokenize.tokenize import HfTokenizeConfig, TokenizeConfig From e2ac4dec41b2880b8a8d7b8f0f502e16a0cfbf8c Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:18:05 -0700 Subject: [PATCH 42/56] Add override_output_path to nemotron_v2 datasets Wire override_output_path through NemotronV2Dataset to download_nemotron_v2_step. Fix missing raw/ prefix on nemotron_cc_math_v1. Add overrides for code_v2, specialized_v1, and sft_v1 to pin existing output paths. 
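A minimal sketch of the resulting wiring inside download_nemotron_v2_step (names match the diff below; imports and surrounding code elided):

    info = NEMOTRON_V2_DATASETS[family]  # e.g. family = "nemotron_cc_v2"
    return download_hf_step(
        f"raw/{family}",
        hf_dataset_id=info.hf_dataset_id,
        revision=info.revision,
        # None for datasets without a pinned path; otherwise reuses the
        # existing download, e.g. "raw/nemotron_cc_v2-674913"
        override_output_path=info.override_output_path,
    )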
Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/nemotron_v2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/marin/src/marin/datakit/download/nemotron_v2.py b/lib/marin/src/marin/datakit/download/nemotron_v2.py index 24f074a92a..0e845bd2cd 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v2.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v2.py @@ -71,7 +71,7 @@ class NemotronV2Dataset: "4plus": "4plus/**/*.parquet", "4plus_mind": "4plus_MIND/**/*.parquet", }, - override_output_path="nemotron_cc_math_v1-322fe4", + override_output_path="raw/nemotron_cc_math_v1-322fe4", ), "nemotron_pretraining_code_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v1", @@ -93,6 +93,7 @@ class NemotronV2Dataset: "synthetic_rewriting": "Synthetic-Rewriting/**/*.parquet", "synthetic_transpilation": "Synthetic-Transpilation/**/*.parquet", }, + override_output_path="raw/nemotron_pretraining_code_v2-d15a24", ), "nemotron_pretraining_specialized_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-Pretraining-Specialized-v1", @@ -105,6 +106,7 @@ class NemotronV2Dataset: "rqa": "Nemotron-Pretraining-RQA/**/*.parquet", "infinibyte_reasoning": "Nemotron-Pretraining-InfiniByte-Reasoning/**/*.parquet", }, + override_output_path="raw/nemotron_pretraining_specialized_v1-a31fae", ), "nemotron_pretraining_sft_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-Pretraining-SFT-v1", @@ -114,6 +116,7 @@ class NemotronV2Dataset: "sft_general": "Nemotron-SFT-General/**/*.parquet", "sft_math": "Nemotron-SFT-MATH/**/*.parquet", }, + override_output_path="raw/nemotron_pretraining_sft_v1-10f77e", ), } @@ -125,4 +128,5 @@ def download_nemotron_v2_step(family: str) -> StepSpec: f"raw/{family}", hf_dataset_id=info.hf_dataset_id, revision=info.revision, + override_output_path=info.override_output_path, ) From 7c2ab1e9e6a73aa4b9198636c14f86508801600c Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:24:01 -0700 Subject: [PATCH 43/56] Revert huggingface_utils.py rename back to huggingface.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The _utils suffix was misleading — this is the core HF download module, not a utility helper. Reverts all 19 import paths. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/common_pile/tokenize_common_pile.py | 2 +- experiments/defaults.py | 2 +- experiments/eval_datasets.py | 2 +- experiments/midtraining_datasets.py | 2 +- experiments/models.py | 2 +- .../download_and_tokenize_fineweb2_hq.py | 2 +- experiments/paloma.py | 2 +- experiments/posttrain/preference_datasets.py | 2 +- experiments/pretraining_datasets/dolma.py | 2 +- experiments/pretraining_datasets/dolmino.py | 2 +- experiments/pretraining_datasets/simple.py | 2 +- .../train_test_overlap/eval_datasets_overlap.py | 2 +- experiments/two_stage/data.py | 2 +- .../download/{huggingface_utils.py => huggingface.py} | 0 lib/marin/src/marin/datakit/download/nemotron_v2.py | 3 ++- .../marin/processing/tokenize/download_pretokenized.py | 2 +- lib/marin/src/marin/speedrun/paloma_local_download.py | 4 ++-- tests/datakit/download/test_huggingface.py | 10 +++++----- tests/datakit/test_datakit.py | 2 +- tests/test_hfdataset_spec.py | 2 +- 20 files changed, 25 insertions(+), 24 deletions(-) rename lib/marin/src/marin/datakit/download/{huggingface_utils.py => huggingface.py} (100%) diff --git a/experiments/common_pile/tokenize_common_pile.py b/experiments/common_pile/tokenize_common_pile.py index 1ec5b2f86a..faee07fc76 100644 --- a/experiments/common_pile/tokenize_common_pile.py +++ b/experiments/common_pile/tokenize_common_pile.py @@ -5,7 +5,7 @@ from experiments.defaults import default_tokenize from experiments.llama import llama3_tokenizer -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path from marin.processing.tokenize.data_configs import TokenizerStep, lm_mixture_data_config diff --git a/experiments/defaults.py b/experiments/defaults.py index ef1e9ad892..01e9583442 100644 --- a/experiments/defaults.py +++ b/experiments/defaults.py @@ -46,7 +46,7 @@ from experiments.simple_sft_config import SimpleSFTConfig from experiments.simple_train_config import SimpleTrainConfig from levanter.utils.mesh import MeshConfig -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.evaluation.evaluation_config import EvalTaskConfig from marin.execution.executor import ( ExecutorStep, diff --git a/experiments/eval_datasets.py b/experiments/eval_datasets.py index db6e8f8f54..f55df8b3fc 100644 --- a/experiments/eval_datasets.py +++ b/experiments/eval_datasets.py @@ -3,7 +3,7 @@ import dataclasses -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/midtraining_datasets.py b/experiments/midtraining_datasets.py index b30e57dc67..2706f8a4e9 100644 --- a/experiments/midtraining_datasets.py +++ b/experiments/midtraining_datasets.py @@ -4,7 +4,7 @@ from experiments.common_pile.tokenize_common_pile import stackv2_edu_filtered from experiments.defaults import default_download, default_tokenize from experiments.llama import llama3_tokenizer -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from 
marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution import versioned from marin.execution.executor import ExecutorStep, this_output_path from marin.processing.tokenize import lm_mixture_data_config diff --git a/experiments/models.py b/experiments/models.py index 1afb7bb907..972ca4f753 100644 --- a/experiments/models.py +++ b/experiments/models.py @@ -18,7 +18,7 @@ from dataclasses import dataclass -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.utils import get_directory_friendly_name diff --git a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py index db4a6fbb4b..a3fd2ae82a 100644 --- a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py +++ b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py @@ -13,7 +13,7 @@ from experiments.llama import llama3_tokenizer from experiments.multilingual_fineweb2_hq.constants import FINEWEB2_DATASETS -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/paloma.py b/experiments/paloma.py index e354e31d54..24c1a536df 100644 --- a/experiments/paloma.py +++ b/experiments/paloma.py @@ -9,7 +9,7 @@ import os.path -from marin.datakit.download.huggingface_utils import DownloadConfig as HfDownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig, download_hf # cyclic dependency # from experiments.llama import llama3_tokenizer diff --git a/experiments/posttrain/preference_datasets.py b/experiments/posttrain/preference_datasets.py index 9ea785000c..105722d2af 100644 --- a/experiments/posttrain/preference_datasets.py +++ b/experiments/posttrain/preference_datasets.py @@ -22,7 +22,7 @@ from collections.abc import Sequence from dataclasses import dataclass, field -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ( ExecutorStep, executor_main, diff --git a/experiments/pretraining_datasets/dolma.py b/experiments/pretraining_datasets/dolma.py index 51604389c0..02b62df0aa 100644 --- a/experiments/pretraining_datasets/dolma.py +++ b/experiments/pretraining_datasets/dolma.py @@ -10,7 +10,7 @@ import os.path -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned, InputName from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/dolmino.py b/experiments/pretraining_datasets/dolmino.py index 0d8eb18a60..25dab84f52 100644 --- a/experiments/pretraining_datasets/dolmino.py +++ b/experiments/pretraining_datasets/dolmino.py @@ -5,7 +5,7 @@ import os.path 
-from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py index b4ab925bae..5fa9a5fa65 100644 --- a/experiments/pretraining_datasets/simple.py +++ b/experiments/pretraining_datasets/simple.py @@ -12,7 +12,7 @@ from levanter.data.text import TextLmDatasetFormat from levanter.store.cache import CacheOptions -from marin.datakit.download.huggingface_utils import download_hf_step +from marin.datakit.download.huggingface import download_hf_step from marin.execution.executor import ExecutorStep, InputName, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize diff --git a/experiments/train_test_overlap/eval_datasets_overlap.py b/experiments/train_test_overlap/eval_datasets_overlap.py index f547aa3170..b7df8679aa 100644 --- a/experiments/train_test_overlap/eval_datasets_overlap.py +++ b/experiments/train_test_overlap/eval_datasets_overlap.py @@ -1,7 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/two_stage/data.py b/experiments/two_stage/data.py index 3493638097..c78daf0ab1 100644 --- a/experiments/two_stage/data.py +++ b/experiments/two_stage/data.py @@ -6,7 +6,7 @@ from experiments.midtraining_datasets import finemath_3_plus_tokenized from experiments.pretraining_datasets import tokenize_dolma from experiments.pretraining_datasets.simple import tokenized -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path dolma_components = tokenize_dolma() diff --git a/lib/marin/src/marin/datakit/download/huggingface_utils.py b/lib/marin/src/marin/datakit/download/huggingface.py similarity index 100% rename from lib/marin/src/marin/datakit/download/huggingface_utils.py rename to lib/marin/src/marin/datakit/download/huggingface.py diff --git a/lib/marin/src/marin/datakit/download/nemotron_v2.py b/lib/marin/src/marin/datakit/download/nemotron_v2.py index 0e845bd2cd..91b644730b 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v2.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v2.py @@ -12,7 +12,7 @@ from dataclasses import dataclass, field -from marin.datakit.download.huggingface_utils import download_hf_step +from marin.datakit.download.huggingface import download_hf_step from marin.execution.step_spec import StepSpec @@ -25,6 +25,7 @@ class NemotronV2Dataset: subsets: dict[str, str] = field(default_factory=dict) """Maps subset_name -> glob pattern for parquet files within the download.""" override_output_path: str | None = None + """Allow to point at existing download output to avoid re-downloading""" NEMOTRON_V2_DATASETS: dict[str, NemotronV2Dataset] = { diff --git 
a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py index f7a30d4c25..cab2433bec 100644 --- a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py +++ b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py @@ -18,7 +18,7 @@ ) from levanter.store.cache import CacheOptions -from marin.datakit.download.huggingface_utils import ( +from marin.datakit.download.huggingface import ( DownloadConfig as HfDownloadConfig, download_hf as hf_download_logic, ) diff --git a/lib/marin/src/marin/speedrun/paloma_local_download.py b/lib/marin/src/marin/speedrun/paloma_local_download.py index dd0031481d..e2ee68f766 100644 --- a/lib/marin/src/marin/speedrun/paloma_local_download.py +++ b/lib/marin/src/marin/speedrun/paloma_local_download.py @@ -8,8 +8,8 @@ """ from experiments.paloma import paloma_tokenized -from marin.datakit.download.huggingface_utils import DownloadConfig as HfDownloadConfig -from marin.datakit.download.huggingface_utils import download_hf +from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig +from marin.datakit.download.huggingface import download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned llama3_tokenizer = "meta-llama/Meta-Llama-3.1-8B" diff --git a/tests/datakit/download/test_huggingface.py b/tests/datakit/download/test_huggingface.py index 4626bd498f..f055cc94ca 100644 --- a/tests/datakit/download/test_huggingface.py +++ b/tests/datakit/download/test_huggingface.py @@ -9,7 +9,7 @@ import pytest -from marin.datakit.download.huggingface_utils import ( +from marin.datakit.download.huggingface import ( DownloadConfig, _relative_path_in_source, download_hf, @@ -76,7 +76,7 @@ def test_download_hf_basic(mock_hf_fs, tmp_path): ) # Mock HfFileSystem creation - with patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): download_hf(cfg) # Verify files were downloaded @@ -118,7 +118,7 @@ def test_download_hf_appends_sha_when_configured(mock_hf_fs, tmp_path): append_sha_to_path=True, ) - with patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): download_hf(cfg) target_output = base_output_path / revision @@ -180,8 +180,8 @@ def read(self, chunk_size): hf_fs.open.side_effect = lambda path, mode="rb", **_kwargs: FlakyReader() with ( - patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs), - patch("marin.datakit.download.huggingface_utils.time.sleep", return_value=None), + patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs), + patch("marin.datakit.download.huggingface.time.sleep", return_value=None), ): result = stream_file_to_fsspec( str(output_path), diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py index d2b3578020..8a9286763b 100644 --- a/tests/datakit/test_datakit.py +++ b/tests/datakit/test_datakit.py @@ -9,7 +9,7 @@ import pytest from levanter.store.cache import CacheLedger, TreeCache -from marin.datakit.download.huggingface_utils import download_hf_step +from marin.datakit.download.huggingface import download_hf_step from marin.execution.step_runner import StepRunner from marin.execution.step_spec import StepSpec from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize diff --git 
a/tests/test_hfdataset_spec.py b/tests/test_hfdataset_spec.py index ef6d2bd264..14ad782471 100644 --- a/tests/test_hfdataset_spec.py +++ b/tests/test_hfdataset_spec.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from experiments.defaults import default_download, default_tokenize -from marin.datakit.download.huggingface_utils import DownloadConfig +from marin.datakit.download.huggingface import DownloadConfig from marin.processing.tokenize import HfDatasetSpec from marin.processing.tokenize.tokenize import HfTokenizeConfig, TokenizeConfig From 63de1bf4e3fe8216abb7a44abb99c0492290395a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:35:04 -0700 Subject: [PATCH 44/56] Delete unused dclm_hq download and transform modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit datakit/download/dclm_hq.py (CC HTML fetcher) and transform/dolmino/transform_dclm_hq.py (HTML→text converter) have zero experiment consumers. Removes their test as well. The DCLM mixture config in experiments/pretraining_datasets/dclm.py is unrelated and kept. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/dclm_hq.py | 194 ----------------- .../transform/dolmino/transform_dclm_hq.py | 156 -------------- tests/datakit/download/test_dclm_hq.py | 196 ------------------ 3 files changed, 546 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download/dclm_hq.py delete mode 100644 lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py delete mode 100644 tests/datakit/download/test_dclm_hq.py diff --git a/lib/marin/src/marin/datakit/download/dclm_hq.py b/lib/marin/src/marin/datakit/download/dclm_hq.py deleted file mode 100644 index b473768ec0..0000000000 --- a/lib/marin/src/marin/datakit/download/dclm_hq.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -""" -Download DCLM HQ HTML data by fetching HTML content from Common Crawl. - -Processes DCLM HQ JSONL files and enriches them with HTML content fetched from Common Crawl -via a custom index server. Uses zephyr for parallel processing with flattened parallelism. - -Example Usage: -uv run zephyr --backend=ray --max-parallelism=800 --memory=2GB \ - lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py \ - --input_path gs://marin-us-central2/raw/dclm-baseline-1.0-parquet/global/ \ - --output_path gs://marin-data/processed/dclm-hq-html/ -""" - -import io -import json -import logging -import os -import re -from dataclasses import dataclass - -import requests -from iris.marin_fs import open_url -import warcio -from marin.utils import fsspec_glob -from tqdm import tqdm -from zephyr import Dataset, ZephyrContext -from zephyr.writers import ensure_parent_dir - -CC_IDX_HOST_URL = "http://34.72.201.218:8080" -logger = logging.getLogger(__name__) - - -@dataclass -class FileTask: - """Represents a single file processing task.""" - - input_file_path: str - output_file_path: str - - -def fetch_warc_from_cc(s3_warc_path: str, length: int, offset: int) -> str: - """ - Fetch a WARC record from Common Crawl S3 bucket using byte range requests we get - from the CC index via `find_html_in_cc`. 
- Args: - s3_warc_path: Path to WARC file in S3 bucket - length: Length of the record in bytes - offset: Byte offset of the record in the WARC file - Returns: - The WARC record content as a string - """ - # Convert string values to integers - offset = int(offset) - length = int(length) - - # Make range request to CommonCrawl - response = requests.get( - f"https://data.commoncrawl.org/{s3_warc_path}", headers={"Range": f"bytes={offset}-{offset + length - 1}"} - ) - response.raise_for_status() - - # Parse WARC record and extract HTML content - with io.BytesIO(response.content) as stream: - for record in warcio.ArchiveIterator(stream): - content = record.content_stream().read() - return content.decode(errors="ignore") - - raise ValueError(f"No WARC records found in response from {s3_warc_path}") - - -def find_html_in_cc(split_id: str, target_uri: str) -> str | None: - """ - We host our own index of the Common Crawl over GCP which we use in this function. - For each call we receive a list of chunks that contain the HTML content for the given target URI. - We then fetch each chunk and concatenate them together to form the complete HTML content. - Args: - split_id: The split ID of the Common Crawl - target_uri: The target URI to find the HTML content for - Returns: - The HTML content as a string - """ - resp = requests.get(f"{CC_IDX_HOST_URL}/{split_id}-index?url={target_uri}&output=json") - - resp.raise_for_status() - - chunks = [json.loads(chunk) for chunk in resp.text.split("\n") if chunk] - sorted_chunks = sorted(chunks, key=lambda x: x["offset"]) - - html_content = "" - - for chunk in sorted_chunks: - warc_path = chunk["filename"] - length = chunk["length"] - offset = chunk["offset"] - - warc_record = fetch_warc_from_cc(warc_path, length, offset) - - html_content += warc_record - - return html_content - - -def process_file(task: FileTask) -> None: - """Process a single DCLM file, fetching HTML from Common Crawl. - - Args: - task: FileTask containing input and output file paths - """ - logger.info(f"Starting processing of file {task.input_file_path}") - logger.info(f"Source: {task.input_file_path}") - logger.info(f"Destination: {task.output_file_path}") - try: - ensure_parent_dir(task.output_file_path) - with ( - open_url(task.input_file_path, compression="zstd") as source, - open_url(task.output_file_path, "wt", compression="gzip") as output, - ): - text_wrapper = io.TextIOWrapper(source, encoding="utf-8") - - for line in tqdm(text_wrapper, desc="Processing lines"): - row = json.loads(line.strip()) - - # We need to extract the split from where the record was for querying the index - # The only place we have this information is in the warcinfo key in DCLM HQ - # The format is: - # warc-type: WARC/1.1 - # ... - # isPartOf: CC-MAIN-2024-01 - # This however is a string and not a key-value pair, so we need to extract - # the split from it via regex pattern `isPartOf:\s*(CC-MAIN-\d{4}-\d{2})`. - # This pattern groups the value of the key `isPartOf` that is of the form - # `CC-MAIN-xxxx-xx` where `xxxx` is a year and `xx` is a month. 
- match = re.search(r"isPartOf:\s*(CC-MAIN-\d{4}-\d{2})", row["metadata"]["warcinfo"]) - if match is None: - logger.error(f"No split found for record ID: {row['metadata']['WARC-Record-ID']}") - continue - - is_part_of = match.group(1) - - try: - html_string = find_html_in_cc(is_part_of, row["metadata"]["WARC-Target-URI"]) - - if html_string is None: - logger.error(f"No HTML found for record ID: {row['metadata']['WARC-Record-ID']}") - continue - - if "text" in row: - row.pop("text") - - row["html"] = html_string - - print(json.dumps(row), file=output) - except Exception as e: - logger.exception(f"Error processing line: {e}") - continue - - logger.info("\nProcessing completed successfully!") - logger.info(f"File available at: {task.output_file_path}") - - except Exception as e: - logger.error(f"Error during processing: {e}") - raise - - -def extract_dclm_hq_dump(input_path: str, output_path: str) -> None: - """Process the DCLM HQ dump and enrich with HTML from Common Crawl.""" - logger.info(f"Starting processing of DCLM HQ dump in {input_path}") - - all_files = [] - paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(input_path, "*"))] - - logger.info(f"Found {len(paths)} shards to process") - - for path in paths: - shard_input = os.path.join(input_path, path) - shard_paths = fsspec_glob(os.path.join(shard_input, "*.json.zst")) - - for shard_path in shard_paths: - output_file_path = os.path.join(output_path, path, os.path.basename(shard_path)).replace( - ".json.zst", ".jsonl.gz" - ) - all_files.append(FileTask(input_file_path=shard_path, output_file_path=output_file_path)) - - logger.info(f"Found {len(all_files)} files to process") - - pipeline = Dataset.from_list(all_files).map(process_file) - - ctx = ZephyrContext(name="download-dclm-html") - ctx.execute(pipeline) - - logger.info("Processing completed successfully!") diff --git a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py b/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py deleted file mode 100644 index 42f04264bf..0000000000 --- a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -""" -marin/transform/dolmino/transform_dclm_hq.py - -Performs HTML->Text/MD conversion using the specified tools over a DCLM HQ dump save in DOLMA format. 
- -Example Usage (production, large dataset): -uv run zephyr --backend=ray --max-parallelism=200 --memory=2GB \ - lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py \ - --entry-point=process_dclm_hq_dump \ - --input_hf_path "hf://datasets/allenai/dolmino-mix-1124@main/data/dclm" \ - --output_path gs://bucket/processed/dclm-hq \ - --extract_method resiliparse \ - --extract_config.type resiliparse \ - --hf_repo_id "allenai/dolmino-mix-1124" \ - --hf_revision "main" \ - --hf_paths '["data/dclm"]' - -Example Usage (local testing, small dataset): -uv run zephyr --backend=threadpool --max-parallelism=2 --entry-point=process_dclm_hq_dump \ - lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py \ - --input_hf_path "hf://datasets/allenai/dolmino-mix-1124@main/data/dclm" \ - --output_path /tmp/dclm_hq_test \ - --extract_method trafilatura \ - --extract_config.type trafilatura \ - --extract_config.favor_precision false \ - --extract_config.favor_recall true \ - --hf_repo_id "allenai/dolmino-mix-1124" \ - --hf_revision "main" \ - --hf_paths '["data/dclm"]' \ - --max_split 1 -""" - -import json -import logging -import os -from dataclasses import dataclass - -import draccus -from iris.marin_fs import open_url, url_to_fs -from marin.datakit.download.dclm_hq import find_html_in_cc -from huggingface_hub import HfFileSystem -from marin.schemas.web.convert import ExtractionConfig -from marin.web.convert import convert_page -from tqdm import tqdm -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename - -logger = logging.getLogger(__name__) - - -@dataclass -class DCLMHQExtractionConfig: - input_hf_path: str - output_path: str - extract_method: str - extract_config: ExtractionConfig - hf_repo_id: str - hf_revision: str - hf_paths: list[str] - max_split: int | None = None - - -def process_file( - input_file_path: str, - output_file_path: str, - extract_method: str, - extract_config: ExtractionConfig, -) -> None: - logger.info(f"Starting processing of file {input_file_path}") - logger.info(f"Source: {input_file_path}") - logger.info(f"Destination: {output_file_path}") - - with atomic_rename(output_file_path) as temp_path: - with ( - open_url(input_file_path, "rt", compression="zstd") as source, - open_url(temp_path, "wt", compression="gzip") as output, - ): - for line in tqdm(source, desc="Processing lines"): - row = json.loads(line) - - try: - html_string = find_html_in_cc(row["metadata"]["WARC-Record-ID"], row["metadata"]["WARC-Target-URI"]) - - if html_string is None: - logger.error(f"No HTML found for record ID: {row['metadata']['WARC-Record-ID']}") - continue - - content = convert_page(html_string, extract_method=extract_method, config=extract_config)["content"] - - if content is None: - continue - - out_dict = { - "id": row["id"], - "source": row["source"], - "metadata": row["metadata"], - "text": content, - } - - print(json.dumps(out_dict), file=output) # Without this line, the JSON file will be corrupted - except Exception as e: - logger.exception(f"Error processing line: {e}") - continue - - logger.info("\nProcessing completed successfully!") - logger.info(f"File available at: {output_file_path}") - - -@draccus.wrap() -def process_dclm_hq_dump(cfg: DCLMHQExtractionConfig) -> None: - logger.info(f"Starting processing of DCLM HQ dump in {cfg.input_hf_path}") - - # Glob all files across all shards upfront - all_files = [] - hf_fs = HfFileSystem() - paths = [i.split("/")[-1] for i in hf_fs.ls(cfg.input_hf_path, detail=False)] - paths = paths[: cfg.max_split] if 
cfg.max_split else paths - - logger.info(f"Found {len(paths)} shards to process") - - for path in paths: - input_path = os.path.join(cfg.input_hf_path, path) - shard_paths = [i.split("/")[-1] for i in hf_fs.glob(os.path.join(input_path, "*.json.zst"))] - - for shard_path in shard_paths: - input_file_path = os.path.join(input_path, shard_path) - output_file_path = os.path.join(cfg.output_path, path, shard_path).replace(".json.zst", ".jsonl.gz") - all_files.append( - { - "input": input_file_path, - "output": output_file_path, - "extract_method": cfg.extract_method, - "extract_config": cfg.extract_config, - } - ) - - logger.info(f"Total files to process: {len(all_files)}") - - pipeline = ( - Dataset.from_list(all_files) - .filter(lambda f: not url_to_fs(f["output"])[0].exists(f["output"])) - .map( - lambda f: process_file( - input_file_path=f["input"], - output_file_path=f["output"], - extract_method=f["extract_method"], - extract_config=f["extract_config"], - ) - ) - ) - - ctx = ZephyrContext(name="transform-dclm-hq") - ctx.execute(pipeline) diff --git a/tests/datakit/download/test_dclm_hq.py b/tests/datakit/download/test_dclm_hq.py deleted file mode 100644 index c83b5e03fe..0000000000 --- a/tests/datakit/download/test_dclm_hq.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Tests for DCLM HQ download script that fetches HTML from Common Crawl.""" - -import json -from unittest.mock import patch - -import zstandard as zstd -from marin.datakit.download.dclm_hq import extract_dclm_hq_dump - -SAMPLE_DCLM_RECORDS = [ - { - "id": "test-doc-001", - "source": "common-crawl", - "text": "This is the original text that should be removed.", - "metadata": { - "WARC-Record-ID": "", - "WARC-Target-URI": "http://example.com/test-page", - "warcinfo": ( - "warc-type: WARC/1.1\nWARC-Date: 2024-01-15T10:30:00Z\nisPartOf: CC-MAIN-2024-01\ndescription: Test WARC" - ), - }, - }, - { - "id": "test-doc-002", - "source": "common-crawl", - "text": "This is another original text.", - "metadata": { - "WARC-Record-ID": "", - "WARC-Target-URI": "http://example.com/another-page", - "warcinfo": ( - "warc-type: WARC/1.1\nWARC-Date: 2024-01-15T11:30:00Z\nisPartOf: CC-MAIN-2024-01\ndescription: Test WARC" - ), - }, - }, - { - "id": "test-doc-003", - "source": "common-crawl", - "text": "Third document text.", - "metadata": { - "WARC-Record-ID": "", - "WARC-Target-URI": "http://example.com/third-page", - "warcinfo": ( - "warc-type: WARC/1.1\nWARC-Date: 2024-02-10T09:00:00Z\nisPartOf: CC-MAIN-2024-10\ndescription: Test WARC" - ), - }, - }, -] - -SAMPLE_WARC_HTML = { - "http://example.com/test-page": ( - """ - -Test Page - -

-<h1>Test Article</h1>
-<p>This is test content from Common Crawl.</p>
- -""" - ), - "http://example.com/another-page": ( - """ - -Another Page - -

-<h1>Another Article</h1>
-<p>Different content here.</p>
- -""" - ), - "http://example.com/third-page": ( - """ - -Third Page - -

-<h1>Third Article</h1>
-<p>More content.</p>
- -""" - ), -} - - -def create_warc_bytes(html_content: str) -> bytes: - """Create minimal WARC record bytes for testing.""" - http_response = ( - "HTTP/1.1 200 OK\r\n" - "Content-Type: text/html\r\n" - f"Content-Length: {len(html_content.encode())}\r\n" - "\r\n" - f"{html_content}" - ) - - warc_header = ( - "WARC/1.0\r\n" - "WARC-Type: response\r\n" - "WARC-Record-ID: \r\n" - "WARC-Target-URI: http://example.com/test\r\n" - "Content-Type: application/http; msgtype=response\r\n" - f"Content-Length: {len(http_response.encode())}\r\n" - "\r\n" - ) - - full_warc = warc_header + http_response + "\r\n\r\n" - return full_warc.encode() - - -def create_zstd_compressed_jsonl(records: list[dict]) -> bytes: - """Create zstd compressed JSONL content.""" - jsonl_content = "\n".join(json.dumps(record) for record in records) + "\n" - jsonl_bytes = jsonl_content.encode("utf-8") - cctx = zstd.ZstdCompressor() - return cctx.compress(jsonl_bytes) - - -def test_extract_dclm_hq_pipeline(tmp_path, read_all_jsonl_gz): - """Test full DCLM HQ download pipeline with zephyr integration.""" - output_dir = tmp_path / "output" - output_dir.mkdir() - - # Create input files in nested structure - shard1_dir = tmp_path / "input" / "shard1" - shard2_dir = tmp_path / "input" / "shard2" - shard1_dir.mkdir(parents=True) - shard2_dir.mkdir(parents=True) - - file1_data = create_zstd_compressed_jsonl([SAMPLE_DCLM_RECORDS[0]]) - file2_data = create_zstd_compressed_jsonl(SAMPLE_DCLM_RECORDS[1:]) - - file1_path = shard1_dir / "file1.json.zst" - file2_path = shard2_dir / "file2.json.zst" - - file1_path.write_bytes(file1_data) - file2_path.write_bytes(file2_data) - - def mock_requests_get(url, **kwargs): - from unittest.mock import Mock - - # Mock CC index server responses - if "CC-MAIN-2024-01-index" in url: - response = Mock() - response.status_code = 200 - if "test-page" in url: - response.text = json.dumps({"filename": "test.warc.gz", "offset": "0", "length": "1000"}) - else: # another-page - response.text = json.dumps({"filename": "test2.warc.gz", "offset": "0", "length": "1000"}) - response.raise_for_status = Mock() - return response - elif "CC-MAIN-2024-10-index" in url: - response = Mock() - response.status_code = 200 - response.text = json.dumps({"filename": "test3.warc.gz", "offset": "0", "length": "1000"}) - response.raise_for_status = Mock() - return response - # Mock Common Crawl WARC fetches - elif "data.commoncrawl.org" in url: - response = Mock() - response.status_code = 200 - # Determine which HTML to return based on the WARC file - if "test.warc.gz" in url: - html_content = SAMPLE_WARC_HTML["http://example.com/test-page"] - elif "test2.warc.gz" in url: - html_content = SAMPLE_WARC_HTML["http://example.com/another-page"] - else: # test3.warc.gz - html_content = SAMPLE_WARC_HTML["http://example.com/third-page"] - response.content = create_warc_bytes(html_content) - response.raise_for_status = Mock() - return response - - raise ValueError(f"Unexpected URL: {url}") - - with patch("marin.datakit.download.dclm_hq.requests.get", side_effect=mock_requests_get): - extract_dclm_hq_dump(str(tmp_path / "input"), str(output_dir)) - - # Verify output files were created in nested structure - shard1_output = output_dir / "shard1" - shard2_output = output_dir / "shard2" - - assert shard1_output.exists() - assert shard2_output.exists() - - # Read all records - all_records = [] - all_records.extend(read_all_jsonl_gz(shard1_output, "*.jsonl.gz")) - all_records.extend(read_all_jsonl_gz(shard2_output, "*.jsonl.gz")) - - assert 
len(all_records) == 3 - - # Verify records have HTML and no text - for record in all_records: - assert "id" in record - assert "html" in record - assert "text" not in record - assert "metadata" in record - assert len(record["html"]) > 0 From 38f64741f8c0d7d837040a73b73db99856d7f3e1 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:57:41 -0700 Subject: [PATCH 45/56] Extract dolmino download into datakit/download/dolmino.py Moves the download definition and DOLMINO_DATASETS split metadata into a datakit module. The experiment file now imports from there and only handles tokenization wiring. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/dolmino.py | 40 +++---------------- .../src/marin/datakit/download/dolmino.py | 32 +++++++++++++++ 2 files changed, 37 insertions(+), 35 deletions(-) create mode 100644 lib/marin/src/marin/datakit/download/dolmino.py diff --git a/experiments/pretraining_datasets/dolmino.py b/experiments/pretraining_datasets/dolmino.py index 25dab84f52..8126ded701 100644 --- a/experiments/pretraining_datasets/dolmino.py +++ b/experiments/pretraining_datasets/dolmino.py @@ -5,46 +5,16 @@ import os.path -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.dolmino import DOLMINO_DATASETS, download_dolmino_step from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep -# Raw dataset download step -downloads = { - "dolmino": ( - ExecutorStep( - name="raw/dolmino-mix-1124", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="allenai/dolmino-mix-1124", - revision="bb54cab", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - ) - .with_output_path("raw/dolmino-mix-1124-157960") - .cd("bb54cab") - ) -} +_dolmino_download = download_dolmino_step().as_executor_step() +_dolmino_base_dir = _dolmino_download.cd("bb54cab").cd("data") -_dolmino_base_dir = downloads["dolmino"].cd("data") - -# The following dataset splits define file patterns for each split. 
-DOLMINO_DATASETS = { - "dclm": ["**/*.json.zst"], - "flan": ["**/*.json.gz"], - "math/codesearchnet-owmfilter": ["**/*.jsonl.gz"], - "math/dolmino_math_synth": ["**/*.jsonl"], - "math/gsm8k": ["**/*.jsonl.zst"], - "math/mathcoder2-synthmath": ["**/*.jsonl"], - "math/metamath-owmfilter": ["**/*.jsonl.gz"], - "math/tinyGSM-MIND": ["**/*.jsonl.gz"], - "math/tulu_math": ["**/*.jsonl"], - "pes2o": ["**/*.json.gz"], - "stackexchange": ["**/*.json.gz"], - "wiki": ["**/*.json.gz"], -} +# Backward compat — some consumers import this +downloads = {"dolmino": _dolmino_download.cd("bb54cab")} # NB: we changed how hashes were computed for this corpus and we'd like to avoid recomputing them DOLMINO_LLAMA3_OVERRIDES = { diff --git a/lib/marin/src/marin/datakit/download/dolmino.py b/lib/marin/src/marin/datakit/download/dolmino.py new file mode 100644 index 0000000000..0e1b063cf2 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/dolmino.py @@ -0,0 +1,32 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Dolmino dataset download definition and split metadata.""" + +from marin.datakit.download.huggingface import download_hf_step +from marin.execution.step_spec import StepSpec + +DOLMINO_DATASETS = { + "dclm": ["**/*.json.zst"], + "flan": ["**/*.json.gz"], + "math/codesearchnet-owmfilter": ["**/*.jsonl.gz"], + "math/dolmino_math_synth": ["**/*.jsonl"], + "math/gsm8k": ["**/*.jsonl.zst"], + "math/mathcoder2-synthmath": ["**/*.jsonl"], + "math/metamath-owmfilter": ["**/*.jsonl.gz"], + "math/tinyGSM-MIND": ["**/*.jsonl.gz"], + "math/tulu_math": ["**/*.jsonl"], + "pes2o": ["**/*.json.gz"], + "stackexchange": ["**/*.json.gz"], + "wiki": ["**/*.json.gz"], +} + + +def download_dolmino_step() -> StepSpec: + """Download the dolmino-mix-1124 dataset from HuggingFace.""" + return download_hf_step( + "raw/dolmino-mix-1124", + hf_dataset_id="allenai/dolmino-mix-1124", + revision="bb54cab", + override_output_path="raw/dolmino-mix-1124-157960", + ) From 83a01f804f1102ea9237c188c1bdaaabc4585d0a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 17:20:01 -0700 Subject: [PATCH 46/56] Extract dolma download into datakit/download/dolma.py Moves download_dolma_step(), DOLMA_DATASETS, and DOLMA_OLMO_MIXTURE_WEIGHTS into a datakit module. The experiment file now imports from there and only handles tokenization wiring. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/__init__.py | 3 +- experiments/pretraining_datasets/dolma.py | 73 ++----------------- lib/marin/src/marin/datakit/download/dolma.py | 60 +++++++++++++++ 3 files changed, 67 insertions(+), 69 deletions(-) create mode 100644 lib/marin/src/marin/datakit/download/dolma.py diff --git a/experiments/pretraining_datasets/__init__.py b/experiments/pretraining_datasets/__init__.py index 93e9ffddf0..571c4483cc 100644 --- a/experiments/pretraining_datasets/__init__.py +++ b/experiments/pretraining_datasets/__init__.py @@ -19,12 +19,11 @@ # Import downloads and tokenized dicts from each module from experiments.pretraining_datasets.dolma import ( - DOLMA_DATASETS, DOLMA_LLAMA3_OVERRIDES, - DOLMA_OLMO_MIXTURE_WEIGHTS, downloads as dolma_downloads, tokenize_dolma, ) +from marin.datakit.download.dolma import DOLMA_DATASETS, DOLMA_OLMO_MIXTURE_WEIGHTS from experiments.pretraining_datasets.dolmino import ( DOLMINO_DATASETS, DOLMINO_LLAMA3_OVERRIDES, diff --git a/experiments/pretraining_datasets/dolma.py b/experiments/pretraining_datasets/dolma.py index 02b62df0aa..97d840d79d 100644 --- a/experiments/pretraining_datasets/dolma.py +++ b/experiments/pretraining_datasets/dolma.py @@ -1,83 +1,23 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -""" -DOLMA 1.7 dataset definitions and tokenization. - -This module defines the raw DOLMA dataset download and tokenization -logic for all 15 splits. -""" +"""DOLMA 1.7 dataset definitions and tokenization.""" import os.path -from marin.datakit.download.huggingface import DownloadConfig, download_hf -from marin.execution.executor import ExecutorStep, this_output_path, versioned, InputName +from marin.datakit.download.dolma import DOLMA_DATASETS, download_dolma_step +from marin.execution.executor import ExecutorStep, InputName, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep -# Raw dataset download step -downloads = { - "dolma": ExecutorStep( - name="raw/dolma", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="allenai/dolma", - revision="7f48140", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/dolma", - ) -} +_dolma_download = download_dolma_step().as_executor_step() +# Backward compat — some consumers import this +downloads = {"dolma": _dolma_download} # For dolma 1.7, we hardcode the path since it was added before versioning _DOLMA_V1_7_PATH = InputName.hardcoded("raw/dolma/v1.7") - -# Sampling proportion comes from https://huggingface.co/datasets/allenai/dolma -DOLMA_OLMO_MIXTURE_WEIGHTS = { - "dolma/algebraic-stack": 12.6, # 12.6 * 1.0 - "dolma/arxiv": 28.0, # 28.0 * 1.0 - "dolma/gutenberg": 5.3, # 5.3 * 1.0 - "dolma/c4": 124.95, # 249.9 * 0.5 - "dolma/cc": 597.75, # 1,195.5 * 0.5 - "dolma/cc-news": 14.3, # 1.0 - "dolma/falcon": 456.4, # 1.0, refined web - "dolma/megawika": 4.6, # 1.0 - "dolma/open-web-math": 12.6, # 1.0 - "dolma/pes2o": 57.2, # 1.0 - "dolma/reddit": 79.9, # 1.0 - "dolma/stackexchange": 19.6, # 1.0 - "dolma/starcoder": 263.8, # 1.0 - "dolma/flan": 16.5, # 6.5 * 1.0 - "dolma/wiki": 7.4, # 3.7 * 2.0 -} - -DOLMA_DATASETS = { - "algebraic-stack": ["algebraic-stack-train-{0000..0015}.json.gz"], - "arxiv": ["arxiv-{0000..0099}.json.gz"], - "gutenberg": ["books-{0000..0002}.json.gz"], - "c4": ["c4-{0000..0170}.json.gz"], - "cc": [ - "cc_en_head-{0000..0274}.json.gz", - 
"cc_en_middle-{0000..0238}.json.gz", - "cc_en_middle-{0240..0379}.json.gz", - "cc_en_tail-{0000..0152}.json.gz", - "cc_en_tail-{0154..0444}.json.gz", - ], - "cc-news": ["cc_news_head-{0000..0004}.json.gz", "cc_news_middle-{0000..0002}.json.gz", "cc_news_tail-0000.json.gz"], - "falcon": ["falcon-{0000..0499}.json.gz"], - "megawika": ["megawika-{0000..0261}.json.gz"], - "open-web-math": ["open-web-math-train-{0000..0012}.json.gz"], - "pes2o": ["pes2o-{0000..0025}.json.gz"], - "reddit": ["reddit-{0000..0077}.json.gz"], - "stackexchange": ["stackexchange-{0000..0025}.json.gz"], - "starcoder": ["starcoder-{0000..0048}.json.gz"], - "flan": ["tulu_flan-{0000..0065}.json.gz"], - "wiki": ["wiki-{0000..0001}.json.gz"], -} - # NB: we changed how hashes were computed for this corpus and we'd like to avoid recomputing them DOLMA_LLAMA3_OVERRIDES = { "c4": "tokenized/dolma/c4-e0e5ec", @@ -118,7 +58,6 @@ def tokenize_dolma(*, tokenizer: str | None = None) -> dict[str, TokenizerStep]: ), ) - # Check if we need to use override path for llama3 if tokenizer == llama3_tokenizer and dataset in DOLMA_LLAMA3_OVERRIDES: step = step.with_output_path(DOLMA_LLAMA3_OVERRIDES[dataset]) dolma_steps[os.path.join("dolma", dataset)] = step diff --git a/lib/marin/src/marin/datakit/download/dolma.py b/lib/marin/src/marin/datakit/download/dolma.py new file mode 100644 index 0000000000..b6849d4354 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/dolma.py @@ -0,0 +1,60 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Dolma 1.7 dataset download definition and split metadata.""" + +from marin.datakit.download.huggingface import download_hf_step +from marin.execution.step_spec import StepSpec + +DOLMA_DATASETS = { + "algebraic-stack": ["algebraic-stack-train-{0000..0015}.json.gz"], + "arxiv": ["arxiv-{0000..0099}.json.gz"], + "gutenberg": ["books-{0000..0002}.json.gz"], + "c4": ["c4-{0000..0170}.json.gz"], + "cc": [ + "cc_en_head-{0000..0274}.json.gz", + "cc_en_middle-{0000..0238}.json.gz", + "cc_en_middle-{0240..0379}.json.gz", + "cc_en_tail-{0000..0152}.json.gz", + "cc_en_tail-{0154..0444}.json.gz", + ], + "cc-news": ["cc_news_head-{0000..0004}.json.gz", "cc_news_middle-{0000..0002}.json.gz", "cc_news_tail-0000.json.gz"], + "falcon": ["falcon-{0000..0499}.json.gz"], + "megawika": ["megawika-{0000..0261}.json.gz"], + "open-web-math": ["open-web-math-train-{0000..0012}.json.gz"], + "pes2o": ["pes2o-{0000..0025}.json.gz"], + "reddit": ["reddit-{0000..0077}.json.gz"], + "stackexchange": ["stackexchange-{0000..0025}.json.gz"], + "starcoder": ["starcoder-{0000..0048}.json.gz"], + "flan": ["tulu_flan-{0000..0065}.json.gz"], + "wiki": ["wiki-{0000..0001}.json.gz"], +} + +# Sampling proportion comes from https://huggingface.co/datasets/allenai/dolma +DOLMA_OLMO_MIXTURE_WEIGHTS = { + "dolma/algebraic-stack": 12.6, + "dolma/arxiv": 28.0, + "dolma/gutenberg": 5.3, + "dolma/c4": 124.95, + "dolma/cc": 597.75, + "dolma/cc-news": 14.3, + "dolma/falcon": 456.4, + "dolma/megawika": 4.6, + "dolma/open-web-math": 12.6, + "dolma/pes2o": 57.2, + "dolma/reddit": 79.9, + "dolma/stackexchange": 19.6, + "dolma/starcoder": 263.8, + "dolma/flan": 16.5, + "dolma/wiki": 7.4, +} + + +def download_dolma_step() -> StepSpec: + """Download the Dolma 1.7 dataset from HuggingFace.""" + return download_hf_step( + "raw/dolma", + hf_dataset_id="allenai/dolma", + revision="7f48140", + override_output_path="raw/dolma", + ) From 76b3ae86b50f18771907cd20b16e500c0cecd620 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 
2026 17:25:08 -0700 Subject: [PATCH 47/56] Move DOLMA_OLMO_MIXTURE_WEIGHTS back to experiment file Mixture weights are experiment config, not download metadata. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/__init__.py | 3 ++- experiments/pretraining_datasets/dolma.py | 19 +++++++++++++++++++ lib/marin/src/marin/datakit/download/dolma.py | 19 ------------------- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/experiments/pretraining_datasets/__init__.py b/experiments/pretraining_datasets/__init__.py index 571c4483cc..2e0f6cc004 100644 --- a/experiments/pretraining_datasets/__init__.py +++ b/experiments/pretraining_datasets/__init__.py @@ -20,10 +20,11 @@ # Import downloads and tokenized dicts from each module from experiments.pretraining_datasets.dolma import ( DOLMA_LLAMA3_OVERRIDES, + DOLMA_OLMO_MIXTURE_WEIGHTS, downloads as dolma_downloads, tokenize_dolma, ) -from marin.datakit.download.dolma import DOLMA_DATASETS, DOLMA_OLMO_MIXTURE_WEIGHTS +from marin.datakit.download.dolma import DOLMA_DATASETS from experiments.pretraining_datasets.dolmino import ( DOLMINO_DATASETS, DOLMINO_LLAMA3_OVERRIDES, diff --git a/experiments/pretraining_datasets/dolma.py b/experiments/pretraining_datasets/dolma.py index 97d840d79d..256ea0b58e 100644 --- a/experiments/pretraining_datasets/dolma.py +++ b/experiments/pretraining_datasets/dolma.py @@ -15,6 +15,25 @@ # Backward compat — some consumers import this downloads = {"dolma": _dolma_download} +# Sampling proportion comes from https://huggingface.co/datasets/allenai/dolma +DOLMA_OLMO_MIXTURE_WEIGHTS = { + "dolma/algebraic-stack": 12.6, + "dolma/arxiv": 28.0, + "dolma/gutenberg": 5.3, + "dolma/c4": 124.95, + "dolma/cc": 597.75, + "dolma/cc-news": 14.3, + "dolma/falcon": 456.4, + "dolma/megawika": 4.6, + "dolma/open-web-math": 12.6, + "dolma/pes2o": 57.2, + "dolma/reddit": 79.9, + "dolma/stackexchange": 19.6, + "dolma/starcoder": 263.8, + "dolma/flan": 16.5, + "dolma/wiki": 7.4, +} + # For dolma 1.7, we hardcode the path since it was added before versioning _DOLMA_V1_7_PATH = InputName.hardcoded("raw/dolma/v1.7") diff --git a/lib/marin/src/marin/datakit/download/dolma.py b/lib/marin/src/marin/datakit/download/dolma.py index b6849d4354..7e9ac26e3b 100644 --- a/lib/marin/src/marin/datakit/download/dolma.py +++ b/lib/marin/src/marin/datakit/download/dolma.py @@ -30,25 +30,6 @@ "wiki": ["wiki-{0000..0001}.json.gz"], } -# Sampling proportion comes from https://huggingface.co/datasets/allenai/dolma -DOLMA_OLMO_MIXTURE_WEIGHTS = { - "dolma/algebraic-stack": 12.6, - "dolma/arxiv": 28.0, - "dolma/gutenberg": 5.3, - "dolma/c4": 124.95, - "dolma/cc": 597.75, - "dolma/cc-news": 14.3, - "dolma/falcon": 456.4, - "dolma/megawika": 4.6, - "dolma/open-web-math": 12.6, - "dolma/pes2o": 57.2, - "dolma/reddit": 79.9, - "dolma/stackexchange": 19.6, - "dolma/starcoder": 263.8, - "dolma/flan": 16.5, - "dolma/wiki": 7.4, -} - def download_dolma_step() -> StepSpec: """Download the Dolma 1.7 dataset from HuggingFace.""" From 213d250305f639fe33816d82e715277eba4b2427 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 17:31:15 -0700 Subject: [PATCH 48/56] Add download_wikipedia_step with override pointing at existing data Defaults to the enwiki 20241201 dump with override_output_path pointing at raw/wikipedia-a7dad0 where the data already lives. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/wikipedia.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index ec51c62b0b..1de08ba9fa 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -31,6 +31,7 @@ import requests from iris.marin_fs import open_url +from marin.execution.step_spec import StepSpec from marin.utils import fsspec_size from tqdm_loggable.auto import tqdm from zephyr import Dataset, ZephyrContext, atomic_rename, load_jsonl @@ -111,3 +112,31 @@ def download_wikipedia(input_urls: list[str], revision: str, output_path: str) - ) logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) + + +ENWIKI_20241201_URL = ( + "https://dumps.wikimedia.org/other/enterprise_html/runs/20241201/" "enwiki-NS0-20241201-ENTERPRISE-HTML.json.tar.gz" +) + + +def download_wikipedia_step( + *, + input_urls: list[str] | None = None, + revision: str = "20241201", +) -> StepSpec: + """Download Wikipedia HTML dumps. + + Defaults to the English Wikipedia 20241201 dump which is already + downloaded at ``raw/wikipedia-a7dad0``. + """ + urls = input_urls or [ENWIKI_20241201_URL] + + def _run(output_path: str) -> None: + download_wikipedia(urls, revision, output_path) + + return StepSpec( + name="raw/wikipedia", + fn=_run, + hash_attrs={"input_urls": urls, "revision": revision}, + override_output_path="raw/wikipedia-a7dad0", + ) From 80b8a237ca3282ccc23eb89102e047299c901328 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 18:42:47 -0700 Subject: [PATCH 49/56] Simplify download_wikipedia_step and remove revision param Drop the revision nesting from download_wikipedia. The step uses override_output_path to point at existing data when no input_urls are provided. 
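As a usage sketch of the two modes this step now supports (the fresh-download URL below is a made-up placeholder, not one referenced anywhere in this series):

    from marin.datakit.download.wikipedia import download_wikipedia_step

    # No inputs: reuse the existing dump. override_output_path points the step
    # at the previously downloaded data, so nothing is re-downloaded.
    reuse_existing = download_wikipedia_step()

    # Explicit inputs: perform a fresh download. With no override, the output
    # path is derived from the step name and hash_attrs, so a different URL
    # list lands in a new path.
    fresh = download_wikipedia_step(
        input_urls=[
            "https://dumps.wikimedia.org/other/enterprise_html/runs/20250601/"
            "enwiki-NS0-20250601-ENTERPRISE-HTML.json.tar.gz",  # placeholder URL
        ],
    )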
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/wikipedia.py | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index 1de08ba9fa..c2ccbdbe02 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -90,16 +90,15 @@ def process_file(input_file: str, output_path: str) -> Iterable[str]: raise e -def download_wikipedia(input_urls: list[str], revision: str, output_path: str) -> None: +def download_wikipedia(input_urls: list[str], output_path: str) -> None: """Download and process Wikipedia data.""" logger.info("Starting transfer of Wikipedia dump...") - output_base = os.path.join(output_path, revision) ctx = ZephyrContext(name="download-wikipedia") download_metrics = ctx.execute( Dataset.from_list(input_urls) - .map(lambda url: download_tar(url, output_base)) - .write_jsonl(f"{output_base}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), + .map(lambda url: download_tar(url, output_path)) + .write_jsonl(f"{output_path}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), ) # load all of the output filenames to process @@ -107,36 +106,27 @@ def download_wikipedia(input_urls: list[str], revision: str, output_path: str) - extracted = ctx.execute( Dataset.from_list(downloads) - .flat_map(lambda file: process_file(file, output_base)) - .write_jsonl(f"{output_base}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), + .flat_map(lambda file: process_file(file, output_path)) + .write_jsonl(f"{output_path}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), ) logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) -ENWIKI_20241201_URL = ( - "https://dumps.wikimedia.org/other/enterprise_html/runs/20241201/" "enwiki-NS0-20241201-ENTERPRISE-HTML.json.tar.gz" -) - - def download_wikipedia_step( *, input_urls: list[str] | None = None, - revision: str = "20241201", ) -> StepSpec: - """Download Wikipedia HTML dumps. - - Defaults to the English Wikipedia 20241201 dump which is already - downloaded at ``raw/wikipedia-a7dad0``. - """ - urls = input_urls or [ENWIKI_20241201_URL] + """Download Wikipedia HTML dumps""" def _run(output_path: str) -> None: - download_wikipedia(urls, revision, output_path) + assert input_urls is not None, "input_urls must be provided to download Wikipedia data" + download_wikipedia(input_urls, output_path) return StepSpec( name="raw/wikipedia", fn=_run, - hash_attrs={"input_urls": urls, "revision": revision}, - override_output_path="raw/wikipedia-a7dad0", + hash_attrs={"input_urls": input_urls}, + # NOTE: if no inputs are provided, use the previously downloaded and no longer exposed 2024-12-01 data + override_output_path="raw/wikipedia-9273e1" if input_urls is None else None, ) From 3f85e10c9224fa49443782d83c60ee5c819fcd6d Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 18:47:06 -0700 Subject: [PATCH 50/56] Wire download_wikipedia_step into exp934 as StepSpec dependency The Wikipedia transform step is now a StepSpec with the download step as a dep, replacing the hardcoded mirrored() path. Converted to .as_executor_step().cd("20241201") for backward compat with downstream consumers. 
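Reduced to its essentials, the wiring pattern this patch applies looks roughly like the sketch below; the step name, directory, and run_extraction helper are placeholders, not the identifiers used in exp934:

    from marin.datakit.download.wikipedia import download_wikipedia_step
    from marin.execution.step_spec import StepSpec

    def run_extraction(input_path: str, output_path: str) -> None:
        """Placeholder for the real transform function."""
        ...

    download = download_wikipedia_step()

    transform = StepSpec(
        name="documents/wikipedia-example",  # placeholder name
        fn=lambda output_path: run_extraction(
            input_path=f"{download.output_path}/20241201",
            output_path=output_path,
        ),
        deps=[download],
        hash_attrs={"revision": "20241201"},
    )

    # Bridge back into the executor world for existing consumers.
    step = transform.as_executor_step().cd("20241201")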
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/exp934_hq_vs_pt.py | 39 +++++++++++-------- .../src/marin/datakit/download/wikipedia.py | 2 +- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/experiments/exp934_hq_vs_pt.py b/experiments/exp934_hq_vs_pt.py index 56a385cb32..fa375a30e8 100644 --- a/experiments/exp934_hq_vs_pt.py +++ b/experiments/exp934_hq_vs_pt.py @@ -8,7 +8,9 @@ datasets used by various training experiments. """ +from marin.datakit.download.wikipedia import download_wikipedia_step from marin.execution.executor import ExecutorStep, mirrored, this_output_path, versioned +from marin.execution.step_spec import StepSpec from marin.schemas.web.convert import HtmlToMarkdownConfig, ResiliparseConfig from marin.schemas.web.selectors import ARXIV_BLACKLISTED_SELECTORS, WIKI_BLACKLISTED_SELECTORS from marin.transform.ar5iv.transform_ar5iv import Ar5ivExtractionConfig, process_ar5iv_dump @@ -42,30 +44,33 @@ ), ).with_output_path("documents/stackexchange-resiliparse-custom-fork-ab41ad") -# Wikipedia resiliparse custom fork step (data already exists at hardcoded path) -wikipedia_resiliparse_custom_fork = ( - ExecutorStep( - name="documents/wikipedia-resiliparse-custom-fork", - fn=process_wiki_dump, - config=WikiExtractionConfig( - input_path=mirrored("raw/wikipedia-a7dad0/20241201", budget_gb=1), - revision=versioned("20241201"), - output_path=this_output_path(), +_wikipedia_download = download_wikipedia_step() + +# Wikipedia resiliparse custom fork step +_wikipedia_transform = StepSpec( + name="documents/wikipedia-resiliparse-custom-fork", + fn=lambda output_path: process_wiki_dump( + WikiExtractionConfig( + input_path=f"{_wikipedia_download.output_path}/20241201", + revision="20241201", + output_path=output_path, extract_method="resiliparse", extract_config=ResiliparseConfig( links=False, skip_elements=WIKI_BLACKLISTED_SELECTORS, markdownify_config=HtmlToMarkdownConfig(include_images=False, include_links=False), ), - remove_reference_section=versioned(True), - digit_threshold=versioned(50), - word_threshold=versioned(70), - special_char_threshold=versioned(50), - ), - ) - .with_output_path("documents/wikipedia-resiliparse-custom-fork-2569de") - .cd("20241201") + remove_reference_section=True, + digit_threshold=50, + word_threshold=70, + special_char_threshold=50, + ) + ), + deps=[_wikipedia_download], + hash_attrs={"revision": "20241201", "extract_method": "resiliparse"}, + override_output_path="documents/wikipedia-resiliparse-custom-fork-2569de", ) +wikipedia_resiliparse_custom_fork = _wikipedia_transform.as_executor_step().cd("20241201") # ar5iv resiliparse custom fork step (data already exists at hardcoded path) ar5iv_no_problem_resiliparse_custom_fork = ExecutorStep( diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index c2ccbdbe02..683c5735aa 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -128,5 +128,5 @@ def _run(output_path: str) -> None: fn=_run, hash_attrs={"input_urls": input_urls}, # NOTE: if no inputs are provided, use the previously downloaded and no longer exposed 2024-12-01 data - override_output_path="raw/wikipedia-9273e1" if input_urls is None else None, + override_output_path="raw/wikipedia-a7dad0" if input_urls is None else None, ) From 5032fc6678020ded199af375b8ed051ee8e7d699 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 18:48:59 -0700 Subject: [PATCH 51/56] Fix Wikipedia 
download override path to wikipedia-9273e1 Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/wikipedia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index 683c5735aa..c2ccbdbe02 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -128,5 +128,5 @@ def _run(output_path: str) -> None: fn=_run, hash_attrs={"input_urls": input_urls}, # NOTE: if no inputs are provided, use the previously downloaded and no longer exposed 2024-12-01 data - override_output_path="raw/wikipedia-a7dad0" if input_urls is None else None, + override_output_path="raw/wikipedia-9273e1" if input_urls is None else None, ) From d99a39290472cbf506e3e1877655c02508676626 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 18:58:54 -0700 Subject: [PATCH 52/56] Restore revision parameter in download_wikipedia The revision creates a subdirectory under output_path for the dump data, matching the original behavior. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/wikipedia.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index c2ccbdbe02..a1a158ff76 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -90,15 +90,16 @@ def process_file(input_file: str, output_path: str) -> Iterable[str]: raise e -def download_wikipedia(input_urls: list[str], output_path: str) -> None: +def download_wikipedia(input_urls: list[str], revision: str, output_path: str) -> None: """Download and process Wikipedia data.""" logger.info("Starting transfer of Wikipedia dump...") + output_base = os.path.join(output_path, revision) ctx = ZephyrContext(name="download-wikipedia") download_metrics = ctx.execute( Dataset.from_list(input_urls) - .map(lambda url: download_tar(url, output_path)) - .write_jsonl(f"{output_path}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), + .map(lambda url: download_tar(url, output_base)) + .write_jsonl(f"{output_base}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), ) # load all of the output filenames to process @@ -106,8 +107,8 @@ def download_wikipedia(input_urls: list[str], output_path: str) -> None: extracted = ctx.execute( Dataset.from_list(downloads) - .flat_map(lambda file: process_file(file, output_path)) - .write_jsonl(f"{output_path}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), + .flat_map(lambda file: process_file(file, output_base)) + .write_jsonl(f"{output_base}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), ) logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) @@ -116,17 +117,18 @@ def download_wikipedia(input_urls: list[str], output_path: str) -> None: def download_wikipedia_step( *, input_urls: list[str] | None = None, + revision: str = "20241201", ) -> StepSpec: - """Download Wikipedia HTML dumps""" + """Download Wikipedia HTML dumps.""" def _run(output_path: str) -> None: assert input_urls is not None, "input_urls must be provided to download Wikipedia data" - download_wikipedia(input_urls, output_path) + download_wikipedia(input_urls, revision, output_path) return StepSpec( name="raw/wikipedia", fn=_run, - hash_attrs={"input_urls": input_urls}, - # NOTE: if no 
inputs are provided, use the previously downloaded and no longer exposed 2024-12-01 data + hash_attrs={"input_urls": input_urls, "revision": revision}, + # NOTE: if no inputs are provided, use the previously downloaded 2024-12-01 data override_output_path="raw/wikipedia-9273e1" if input_urls is None else None, ) From 845739fac564151a92f7395b8e6d0f60dbfcc69a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 19:00:49 -0700 Subject: [PATCH 53/56] Make revision required in download_wikipedia_step Both input_urls and revision must be explicitly provided for new downloads. Existing data is still accessed via override_output_path when neither is set. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/wikipedia.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index a1a158ff76..a989b1ea97 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -117,12 +117,13 @@ def download_wikipedia(input_urls: list[str], revision: str, output_path: str) - def download_wikipedia_step( *, input_urls: list[str] | None = None, - revision: str = "20241201", + revision: str | None = None, ) -> StepSpec: """Download Wikipedia HTML dumps.""" def _run(output_path: str) -> None: assert input_urls is not None, "input_urls must be provided to download Wikipedia data" + assert revision is not None, "revision must be provided to download Wikipedia data" download_wikipedia(input_urls, revision, output_path) return StepSpec( From d5499904746bbb6c2926702bc95f84404f1725ab Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 19:03:47 -0700 Subject: [PATCH 54/56] Wire ar5iv_step into exp934 as StepSpec dependency Same pattern as Wikipedia: download step with override pointing at existing data, transform step as StepSpec with download as dep. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/exp934_hq_vs_pt.py | 41 +++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/experiments/exp934_hq_vs_pt.py b/experiments/exp934_hq_vs_pt.py index fa375a30e8..d062454ae3 100644 --- a/experiments/exp934_hq_vs_pt.py +++ b/experiments/exp934_hq_vs_pt.py @@ -8,6 +8,7 @@ datasets used by various training experiments. 
""" +from marin.datakit.download.ar5iv import ar5iv_step from marin.datakit.download.wikipedia import download_wikipedia_step from marin.execution.executor import ExecutorStep, mirrored, this_output_path, versioned from marin.execution.step_spec import StepSpec @@ -72,23 +73,33 @@ ) wikipedia_resiliparse_custom_fork = _wikipedia_transform.as_executor_step().cd("20241201") -# ar5iv resiliparse custom fork step (data already exists at hardcoded path) -ar5iv_no_problem_resiliparse_custom_fork = ExecutorStep( +_ar5iv_download = ar5iv_step( + input_path="gs://marin-us-central2/raw/ar5iv/ar5iv-04-2024-no-problem.zip", + override_output_path="raw/ar5iv/ar5iv-04-2024-no-problem-49c4e3", +) + +# ar5iv resiliparse custom fork step +_ar5iv_transform = StepSpec( name="documents/ar5iv/ar5iv-04-2024-no-problem", - fn=process_ar5iv_dump, - config=Ar5ivExtractionConfig( - input_path=mirrored("raw/ar5iv/ar5iv-04-2024-no-problem-49c4e3/202404", budget_gb=1), - revision="042024", - output_path=this_output_path("resiliparse-custom-fork"), - extract_method=versioned("resiliparse"), - extract_config=ResiliparseConfig( - links=versioned(False), - prepend_title=True, - skip_elements=ARXIV_BLACKLISTED_SELECTORS, - ), - remove_reference_section=versioned(True), + fn=lambda output_path: process_ar5iv_dump( + Ar5ivExtractionConfig( + input_path=f"{_ar5iv_download.output_path}/202404", + revision="042024", + output_path=output_path, + extract_method="resiliparse", + extract_config=ResiliparseConfig( + links=False, + prepend_title=True, + skip_elements=ARXIV_BLACKLISTED_SELECTORS, + ), + remove_reference_section=True, + ) ), -).with_output_path("documents/ar5iv/ar5iv-04-2024-no-problem-3971f") + deps=[_ar5iv_download], + hash_attrs={"revision": "042024", "extract_method": "resiliparse"}, + override_output_path="documents/ar5iv/ar5iv-04-2024-no-problem-3971f", +) +ar5iv_no_problem_resiliparse_custom_fork = _ar5iv_transform.as_executor_step() # MMLU Science QA tokenization medu_mmlu_science_qa_tokenized = default_tokenize( From eae62faa186d4e2612833c7123cce34c0f00f2ad Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 19:50:29 -0700 Subject: [PATCH 55/56] Remove unused download entries from simple.py fineweb (never downloaded), the_stack_dedup, and the_pile_openwebtext2 have no consumers. 
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 experiments/pretraining_datasets/simple.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py
index 5fa9a5fa65..9b360e8f61 100644
--- a/experiments/pretraining_datasets/simple.py
+++ b/experiments/pretraining_datasets/simple.py
@@ -69,7 +69,6 @@ def _build_downloads() -> dict[str, ExecutorStep | InputName]:
     fineweb_edu_base = _dl("raw/fineweb-edu", "HuggingFaceFW/fineweb-edu", "87f0914", "raw/fineweb-edu-87f0914")

     return {
-        "fineweb": _dl("raw/fineweb", "HuggingFaceFW/fineweb", "cd85054", "raw/fineweb"),
         "fineweb_edu": fineweb_edu_base.cd("data"),
         "fineweb_edu_sample_10bt": fineweb_edu_base.cd("sample/10BT"),
         "fineweb_edu_sample_100bt": fineweb_edu_base.cd("sample/100BT"),
@@ -93,19 +92,11 @@
         "dclm_baseline": (
             _dl("raw/dclm-baseline-1.0", "mlfoundations/dclm-baseline-1.0", "a3b142c", "raw/dclm").cd("a3b142c")
         ),
-        "the_stack_dedup": (
-            _dl("raw/the-stack-dedup", "bigcode/the-stack-dedup", "17cad72", "raw/the-stack-dedup-4ba450").cd("17cad72")
-        ),
         "proofpile_2": (
             _dl("raw/proof-pile-2", "EleutherAI/proof-pile-2", "901a927", "raw/proof-pile-2-f1b1d8").cd(
                 "901a927/huggingface.co/datasets/EleutherAI/proof-pile-2/resolve/901a927"
             )
         ),
-        "the_pile_openwebtext2": (
-            _dl("raw/the_pile_openwebtext2", "vietgpt/the_pile_openwebtext2", "1de27c6", "raw/the_pile_openwebtext2").cd(
-                "1de27c6/huggingface.co/datasets/vietgpt/the_pile_openwebtext2/resolve/1de27c6"
-            )
-        ),
         "starcoderdata": _dl("raw/starcoderdata", "bigcode/starcoderdata", "9fc30b5", "raw/starcoderdata-720c8c"),
     }

From 6e750844034583d5c9de40cb7cdffed3784b152c Mon Sep 17 00:00:00 2001
From: Rafal Wojdyla
Date: Thu, 26 Mar 2026 16:22:15 -0700
Subject: [PATCH 56/56] Address PR review comments

- Add append_sha_to_path to download_hf_step, fix dolma3 download which writes files under {output_path}/{revision} (P1 fix)
- Flatten ar5iv from package to single ar5iv.py, delete unused ar5iv-v04-2024.json data file

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 experiments/pretraining_datasets/simple.py    | 16 +++++++++---
 .../download/{ar5iv/download.py => ar5iv.py}  |  0
 .../marin/datakit/download/ar5iv/__init__.py  |  7 -----
 .../download/ar5iv/ar5iv-v04-2024.json        | 26 ------
 .../src/marin/datakit/download/huggingface.py |  4 +++
 5 files changed, 17 insertions(+), 36 deletions(-)

diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py
index 9b360e8f61..1e3e4ff6e7 100644
--- a/experiments/pretraining_datasets/simple.py
+++ b/experiments/pretraining_datasets/simple.py
@@ -53,10 +53,16 @@ def _tokenize_simple(
     return step


-def _dl(name: str, hf_dataset_id: str, revision: str, output_path: str) -> ExecutorStep:
+def _dl(
+    name: str, hf_dataset_id: str, revision: str, output_path: str, *, append_sha_to_path: bool = False
+) -> ExecutorStep:
     """Create a download ExecutorStep from a StepSpec."""
     return download_hf_step(
-        name, hf_dataset_id=hf_dataset_id, revision=revision, override_output_path=output_path
+        name,
+        hf_dataset_id=hf_dataset_id,
+        revision=revision,
+        append_sha_to_path=append_sha_to_path,
+        override_output_path=output_path,
     ).as_executor_step()


@@ -83,7 +89,11 @@ def _build_downloads() -> dict[str, ExecutorStep | InputName]:
         ),
         "dolma3_mix_150b_1025": (
             _dl(
-                "raw/dolma3_mix-150B-1025", "allenai/dolma3_mix-150B-1025", "15d04ee", "raw/dolma3_mix-150B-1025-15d04ee"
+                "raw/dolma3_mix-150B-1025",
+                "allenai/dolma3_mix-150B-1025",
+                "15d04ee",
+                "raw/dolma3_mix-150B-1025-15d04ee",
+                append_sha_to_path=True,
             ).cd("15d04ee")
         ),
         "dclm_baseline_wrong": _dl(
diff --git a/lib/marin/src/marin/datakit/download/ar5iv/download.py b/lib/marin/src/marin/datakit/download/ar5iv.py
similarity index 100%
rename from lib/marin/src/marin/datakit/download/ar5iv/download.py
rename to lib/marin/src/marin/datakit/download/ar5iv.py
diff --git a/lib/marin/src/marin/datakit/download/ar5iv/__init__.py b/lib/marin/src/marin/datakit/download/ar5iv/__init__.py
deleted file mode 100644
index 5d820ef55f..0000000000
--- a/lib/marin/src/marin/datakit/download/ar5iv/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright The Marin Authors
-# SPDX-License-Identifier: Apache-2.0
-
-from marin.datakit.download.ar5iv.download import Ar5ivDownloadConfig as Ar5ivDownloadConfig
-from marin.datakit.download.ar5iv.download import ar5iv_step as ar5iv_step
-from marin.datakit.download.ar5iv.download import download as download
-from marin.datakit.download.ar5iv.download import process_shard as process_shard
diff --git a/lib/marin/src/marin/datakit/download/ar5iv/ar5iv-v04-2024.json b/lib/marin/src/marin/datakit/download/ar5iv/ar5iv-v04-2024.json
deleted file mode 100644
index 7c178afb61..0000000000
--- a/lib/marin/src/marin/datakit/download/ar5iv/ar5iv-v04-2024.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
-  "dataset": "ar5iv",
-  "version": "v04.2024",
-  "links": [
-    {
-      "name": "C-UDA-1.0.md",
-      "url": "",
-      "checksum": {"type": "md5", "encoding": "hex", "hash": "0476ea786ce0e3291f6eaaabc43e250e"}
-    },
-    {
-      "name": "ar5iv-04-2024-errors.zip",
-      "url": "",
-      "checksum": {"type": "md5", "encoding": "hex", "hash": "9178d9635085a657956402077b4f8301"}
-    },
-    {
-      "name": "ar5iv-04-2024-no-problem.zip",
-      "url": "",
-      "checksum": {"type": "md5", "encoding": "hex", "hash": "6ffa80fa273f29716527db36e1841abf"}
-    },
-    {
-      "name": "ar5iv-04-2024-warnings.zip",
-      "url": "",
-      "checksum": {"type": "md5", "encoding": "hex", "hash": "51582b218f55286e5fe08431eb5e299d"}
-    }
-  ]
-}
diff --git a/lib/marin/src/marin/datakit/download/huggingface.py b/lib/marin/src/marin/datakit/download/huggingface.py
index c414df96a9..f6ee228cd5 100644
--- a/lib/marin/src/marin/datakit/download/huggingface.py
+++ b/lib/marin/src/marin/datakit/download/huggingface.py
@@ -349,6 +349,7 @@ def download_hf_step(
     hf_dataset_id: str,
     revision: str,
     hf_urls_glob: list[str] | None = None,
+    append_sha_to_path: bool = False,
     zephyr_max_parallelism: int = 8,
     deps: list[StepSpec] | None = None,
     override_output_path: str | None = None,
@@ -362,6 +363,7 @@
         hf_dataset_id: HuggingFace dataset identifier (e.g. "HuggingFaceFW/fineweb").
         revision: Commit hash from the HF dataset repo.
         hf_urls_glob: Glob patterns to select specific files. Empty means all files.
+        append_sha_to_path: If True, write outputs under ``output_path/{revision}``.
         zephyr_max_parallelism: Maximum download parallelism.
         deps: Optional upstream dependencies.
         override_output_path: Override the computed output path entirely.
@@ -378,6 +380,7 @@ def _run(output_path: str) -> None:
                 revision=revision,
                 hf_urls_glob=resolved_glob,
                 gcs_output_path=output_path,
+                append_sha_to_path=append_sha_to_path,
                 zephyr_max_parallelism=zephyr_max_parallelism,
             )
         )
@@ -390,6 +393,7 @@ def _run(output_path: str) -> None:
             "hf_dataset_id": hf_dataset_id,
             "revision": revision,
             "hf_urls_glob": resolved_glob,
+            "append_sha_to_path": append_sha_to_path,
         },
         override_output_path=override_output_path,
     )
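
For reference, a minimal caller-side sketch of the new flag (the dataset id and revision below are hypothetical placeholders; `_dl` and `download_hf_step` are the helpers shown above): with `append_sha_to_path=True` the download writes files under `{output_path}/{revision}`, so consumers point at that subdirectory via `.cd(revision)`, mirroring the dolma3_mix_150b_1025 wiring.

```python
# Sketch only: "some-org/some-dataset" and "abc1234" are hypothetical placeholders,
# not datasets referenced by this patch series.
revision = "abc1234"
example_download = _dl(
    "raw/some-dataset",
    "some-org/some-dataset",
    revision,
    f"raw/some-dataset-{revision}",
    append_sha_to_path=True,  # files land under {output_path}/{revision}
).cd(revision)  # point downstream steps at the revision subdirectory
```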