From 7a242296ac6cba31783af68ffb918451f8f17245 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Mon, 9 Mar 2026 13:11:11 -0700 Subject: [PATCH 01/56] Stage datakit design doc --- docs/design/2355_datakit.md | 218 ++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 docs/design/2355_datakit.md diff --git a/docs/design/2355_datakit.md b/docs/design/2355_datakit.md new file mode 100644 index 0000000000..7ef15bf46e --- /dev/null +++ b/docs/design/2355_datakit.md @@ -0,0 +1,218 @@ +Marin has most of the pieces for end-to-end data processing \- download, dedup, filtering, classification, decontamination, tokenization \- but the code is scattered across `experiments/` and `lib/marin/` with inconsistent formats, ad-hoc ID handling, and unclear provenance.

We propose consolidating this into **datakit**: a set of composable pipeline stages with standardized formats and conventions, living in `lib/marin/datakit/`. Dataset-specific wiring (e.g., "for Arxiv, apply these transforms") lives in `experiments/` or reference configurations.


Links:
 * [marin\#2355](https://github.com/marin-community/marin/issues/2355)
 * [gdoc](https://docs.google.com/document/d/1kDSzONg32zv2VnCO4FJiMP0fcjRSjgP0uTDpI4_C4O0)

# Golden Path

The canonical pipeline for getting a dataset from source to training:

`Download → Normalize → Embed → Classify/Filter → Dedup → Tokenize`

Notably, datakit in the proposed form does not include **data mixing** or **training**.

## 1\. Download

Download the raw dataset from Hugging Face (or other sources). Raw downloads are preserved as-is in their original format and directory structure.

## 2\. Normalize to Standard Format

Convert raw data into the **datakit standard format**:

* **File format**: Vortex \- columnar, supports pushdown filters and column projection, efficient lookup.
* **Mandatory columns**:
  * `id` \- unique document identifier (see [ID Column](#id-column) below)
  * `text` \- primary text content \- we enforce UTF-8
* **Arbitrary additional columns**: any fields present in the raw data are preserved
* **Directory structure**: preserve the original directory structure
* **Partition structure**: partition layout from the source does NOT need to be preserved at this point \- and in most cases it will not be
  * We may want to introduce a more efficient partitioning at this stage and preserve the new partitioning until tokenization
  * The partitions must follow the `part-x-of-y` suffix naming convention
* **Sort invariant**: each partition is sorted by `id`
* **Typed output:** in the code, the data has a typed representation via `Artifact`

This is the "intake" step \- all downstream stages operate on normalized Vortex datasets.

## 3\. Embed

Produce vector embeddings for each document. Output is an **attributes dataset** (see [Attributes Datasets](#attributes-datasets)) with embedding vectors keyed by `id`.

## 4\. Quality Classification, Topic Assignment

Each classifier produces an **attributes dataset** containing scores/labels keyed by `id`.

## 5\. Deduplication

Produces an **attributes dataset** marking duplicate spans or documents.

## 6\. Consolidation

Join attributes datasets back to the source documents and apply filters:

* Filter by classifier thresholds (e.g., quality score \> 0.8)
* Remove duplicate spans/documents

Output is a clean, filtered Vortex dataset \- still sorted by `id`, still co-partitioned.

## 7\. Tokenize

Convert clean text into tokenized Levanter cache format.

**Tokenization is the boundary where per-document structure ends.** The tokenizer concatenates documents into fixed-size token sequences for efficient training. Partition structure from earlier stages does not carry through \- the output is sharded Levanter TreeStore caches with a `.stats.json` summary.

# Core Design Decisions

## Vortex as the Standard Format

All intermediate datasets (from normalization through consolidation) use the Vortex columnar format. Benefits:

* Column projection (only read the columns you need)
* Filter pushdown
* Efficient sorted merge joins via Zephyr

NOTE: Vortex is much less mature than Parquet. This is a major concern. We will start with Vortex and, if we hit roadblocks, revert to Parquet.

## ID Column {#id-column}

* **Preserve existing IDs** when present in the raw data (e.g., WARC-Record-ID in DCLM, HF row indices). These carry provenance meaning and aid debugging.
  * But rename the column to `source_id`
* **Generate deterministic IDs** via content hash. Column named `id`. Deterministic hashing ensures reproducibility \- re-running the pipeline produces the same IDs, which preserves caching and diffing.

## Co-Partitioning Invariant

The key invariant that enables efficient joins: **Attributes datasets must have the same number of shards and the same key-range partitioning as their source dataset.**

This means:

* The normalization step determines the partition structure
* All downstream stages (embed, classify, dedup) preserve this structure \- same shard count, same ID ranges per shard
* Consolidation can use Zephyr's `sorted_merge_join` without a costly `group_by` shuffle

This is enforced by convention: each processing stage reads source partitions 1:1 and writes output partitions with matching structure.

## Attributes Datasets {#attributes-datasets}

Processing stages (embed, classify, dedup) produce **attributes datasets** \- lightweight Vortex files containing:

* `id` — matching the source document ID
* Stage-specific output columns (e.g., `quality_score`, `is_duplicate`, `topic_label`)

Attributes datasets:

* Use Vortex format
* Are co-partitioned with the source (same shard count and key ranges)
* Are sorted by `id` within each partition
* Can be joined back to source documents via `sorted_merge_join`

Multiple attribute datasets from different stages can be joined together during consolidation to apply compound filters.

## Step Orchestration via StepSpec

Datakit builds on `StepSpec` \- the pure-data step descriptor that captures identity and dependencies. Each datakit stage (normalize, classify, dedup, etc.) is a `StepSpec` with:

* **`name`**: human-readable stage name (e.g., `"fineweb/normalize"`)
* **`deps`**: upstream `StepSpec`s whose `output_path` this stage reads from
* **`hash_attrs`**: configuration values that affect output (model name, thresholds, etc.) — changes invalidate the cache
* **`fn`**: the callable that performs the work, receiving `output_path` as its argument

`StepSpec` gives us automatic cache invalidation (via `hash_id` derived from name \+ attrs \+ dep paths), dependency tracking, and deterministic output paths. The step runner handles locking, heartbeats, and status \- datakit stages just describe what to run.
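As a rough sketch of how that content-based identity can be derived (illustrative only \- the real derivation lives in `step_spec.py` and may differ in detail):

```py
import hashlib
import json


def hash_id(name: str, hash_attrs: dict, dep_output_paths: list[str]) -> str:
    """Illustrative: any change to the name, config attrs, or upstream output
    paths produces a new hash, hence a new output_path, hence a cache miss."""
    payload = json.dumps(
        {"name": name, "attrs": hash_attrs, "deps": dep_output_paths},
        sort_keys=True,
        default=str,
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:8]
```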
+ +Example wiring: + +```py +download = StepSpec( + name="fineweb/download", + fn=lambda output_path: download_hf(output_path=output_path, dataset_id="HuggingFaceFW/fineweb"), + hash_attrs={"dataset_id": "HuggingFaceFW/fineweb", "revision": "abc1234"}, +) + +normalize = StepSpec( + name="fineweb/normalize", + deps=[download], + fn=lambda output_path: normalize_to_vortex( + input_path=download.output_path, output_path=output_path, text_field="text", + ), + hash_attrs={"text_field": "text"}, +) + +quality = StepSpec( + name="fineweb/quality", + deps=[normalize], + fn=lambda output_path: classify( + input_path=normalize.output_path, output_path=output_path, model="fasttext-quality-v1", + ), + hash_attrs={"model": "fasttext-quality-v1"}, +) + +dedup = StepSpec( + name="fineweb/dedup", + deps=[normalize], + fn=lambda output_path: deduplicate( + input_path=normalize.output_path, output_path=output_path, mode="fuzzy_document", + ), + hash_attrs={"mode": "fuzzy_document"}, +) + +consolidated = StepSpec( + name="fineweb/consolidated", + deps=[normalize, quality, dedup], + fn=lambda output_path: consolidate( + source_path=normalize.output_path, + attribute_paths=[quality.output_path, dedup.output_path], + output_path=output_path, + quality_threshold=0.8, + ), + hash_attrs={"quality_threshold": 0.8}, +) + +tokenized = StepSpec( + name="fineweb/tokenized", + deps=[consolidated], + fn=lambda output_path: tokenize( + input_path=consolidated.output_path, output_path=output_path, + tokenizer="meta-llama/Llama-3.1-8B", + ), + hash_attrs={"tokenizer": "meta-llama/Llama-3.1-8B"}, +) +``` + +# API Surface + +## `lib/marin/datakit/` + +Core primitives — the reusable building blocks: + +``` +lib/marin/datakit/ + normalize # Raw format -> standard Vortex (id, text, ...) + embed # Document embedding + classify # Quality/topic classification + dedup # Deduplication (exact + fuzzy) + consolidate # Join attributes + apply filters +``` + +## `experiments/` (or reference configurations) + +Dataset-specific wiring \- which transforms to apply for a given dataset, expressed as `StepSpec` DAGs. + +# Execution Plan + +* Implement `datakit/normalize.py` \- standard schema definitions, ID generation, raw format to Vortex conversion with mandatory columns +* Integration tests for the normalize step +* Integration tests covering download, normalize, dedup and tokenize at reasonable scale +* Update Grug/ferry experiment definitions to consume datakit pipeline outputs directly + +# Non-Goals + +* **Replacing the mixing or training APIs** \- datakit standardizes everything upstream of tokenization. +* **Supporting non-text modalities** \- the initial scope is text datasets with a mandatory `text` field. Multimodal support can be added later by relaxing this constraint. + +# Open Questions + +1. **ID uniqueness enforcement**: Per-partition validation is cheap and will be the default. Should we also support global uniqueness checks? What's the failure mode — warn or error? +2. **Non-text datasets**: Code datasets, structured data \- do we need a configurable primary field, or is `text` always sufficient? +3. **Versioning**: How do we version datakit outputs so that downstream consumers (Grug) can pin to a specific processing run? `StepSpec.hash_id` provides content-based versioning, but do we need human-readable version tags as well? 
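To make open question 1 concrete: the per-partition sort invariant turns the default check into a single linear pass \- any duplicate ids would be adjacent. A minimal sketch (shown against Parquet/`pyarrow` for illustration; the same idea applies once a Vortex reader is wired in):

```py
import pyarrow.parquet as pq


def validate_partition_ids(partition_path: str) -> None:
    """Cheap per-partition check: ids must be strictly increasing (sorted and unique)."""
    ids = pq.read_table(partition_path, columns=["id"]).column("id").to_pylist()
    for prev, cur in zip(ids, ids[1:]):
        if cur <= prev:
            raise ValueError(f"{partition_path}: ids not strictly increasing near {cur!r}")
```

A global uniqueness check could additionally compare the last id of partition `k` with the first id of partition `k+1`, which stays cheap as long as partitions are key-range ordered.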
From 2d4ab402fd31634941a513571b88611aaf9d88ed Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Mon, 23 Mar 2026 16:27:03 -0700 Subject: [PATCH 02/56] Add datakit download, normalize, and tokenize modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the first three stages of the datakit pipeline per the design doc (#2355): download_step wraps download_hf, normalize converts raw files to sorted/deduped Parquet with content-hash IDs, and tokenize_step wraps the existing tokenizer for Levanter cache output. Integration test exercises the full DAG (download → normalize → tokenize) via StepRunner. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/__init__.py | 2 + lib/marin/src/marin/datakit/download.py | 62 ++++++++ lib/marin/src/marin/datakit/normalize.py | 194 +++++++++++++++++++++++ lib/marin/src/marin/datakit/tokenize.py | 71 +++++++++ tests/datakit/__init__.py | 2 + tests/datakit/test_datakit.py | 76 +++++++++ 6 files changed, 407 insertions(+) create mode 100644 lib/marin/src/marin/datakit/__init__.py create mode 100644 lib/marin/src/marin/datakit/download.py create mode 100644 lib/marin/src/marin/datakit/normalize.py create mode 100644 lib/marin/src/marin/datakit/tokenize.py create mode 100644 tests/datakit/__init__.py create mode 100644 tests/datakit/test_datakit.py diff --git a/lib/marin/src/marin/datakit/__init__.py b/lib/marin/src/marin/datakit/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/datakit/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/datakit/download.py b/lib/marin/src/marin/datakit/download.py new file mode 100644 index 0000000000..0724472143 --- /dev/null +++ b/lib/marin/src/marin/datakit/download.py @@ -0,0 +1,62 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Datakit download stage — fetch a HuggingFace dataset to persistent storage.""" + +from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.execution.step_spec import StepSpec + + +def download_step( + name: str, + *, + hf_dataset_id: str, + revision: str, + hf_urls_glob: list[str] | None = None, + zephyr_max_parallelism: int = 8, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads a HuggingFace dataset. + + The raw download is preserved as-is in its original format and directory structure. + + Args: + name: Step name (e.g. "fineweb/download"). + hf_dataset_id: HuggingFace dataset identifier (e.g. "HuggingFaceFW/fineweb"). + revision: Commit hash from the HF dataset repo. + hf_urls_glob: Glob patterns to select specific files. Empty means all files. + zephyr_max_parallelism: Maximum download parallelism. + deps: Optional upstream dependencies. + output_path_prefix: Override the default output path prefix. + override_output_path: Override the computed output path entirely. + + Returns: + A StepSpec whose output_path contains the raw downloaded files. 
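
    Example (illustrative values, mirroring the design doc)::

        step = download_step(
            "fineweb/download",
            hf_dataset_id="HuggingFaceFW/fineweb",
            revision="abc1234",
        )
        # step.output_path is where the raw files land once the step runs.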
+ """ + resolved_glob = hf_urls_glob or [] + + def _run(output_path: str) -> None: + download_hf( + DownloadConfig( + hf_dataset_id=hf_dataset_id, + revision=revision, + hf_urls_glob=resolved_glob, + gcs_output_path=output_path, + zephyr_max_parallelism=zephyr_max_parallelism, + ) + ) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={ + "hf_dataset_id": hf_dataset_id, + "revision": revision, + "hf_urls_glob": resolved_glob, + }, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/lib/marin/src/marin/datakit/normalize.py b/lib/marin/src/marin/datakit/normalize.py new file mode 100644 index 0000000000..bace847696 --- /dev/null +++ b/lib/marin/src/marin/datakit/normalize.py @@ -0,0 +1,194 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Datakit normalize stage — convert raw data into the datakit standard Parquet format. + +The normalize step is the "intake" for the datakit pipeline. It reads raw files +(JSONL, Parquet, or other formats supported by Zephyr), enforces a standard +schema (mandatory ``id`` and ``text`` columns), and writes co-partitioned, +sorted Parquet files. + +Key guarantees after normalization: +- Every record has a deterministic ``id`` (SHA-256 of the text content). +- If the source data has an existing ID field, it is preserved as ``source_id``. +- Text is present and UTF-8 encoded. +- Each output partition is sorted by ``id``. +- Output files follow the ``part-{shard:05d}-of-{total:05d}.parquet`` naming convention. +""" + +import hashlib +import logging +import os +from collections.abc import Iterator + +from marin.execution.artifact import PathsMetadata +from marin.execution.step_spec import StepSpec +from marin.utils import fsspec_glob +from zephyr import Dataset, ShardInfo, ZephyrContext +from zephyr.readers import load_file + +logger = logging.getLogger(__name__) + +DEFAULT_TEXT_FIELD = "text" + + +def content_hash_id(text: str) -> str: + """Generate a deterministic document ID from text content. + + Uses SHA-256 truncated to 16 hex characters for a compact but + collision-resistant identifier. + """ + return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16] + + +def _discover_input_files(input_path: str) -> list[str]: + """Find all supported input files under input_path, excluding dotfiles/directories.""" + extensions = ["jsonl.gz", "jsonl.zst", "jsonl.zstd", "jsonl", "parquet", "vortex"] + files: list[str] = [] + for ext in extensions: + files.extend(fsspec_glob(os.path.join(input_path, f"**/*.{ext}"))) + # Exclude hidden directories (e.g. .metrics/ written by download_hf) + files = [f for f in files if "/." not in f.split(input_path, 1)[-1]] + if not files: + raise ValueError(f"No supported input files found under {input_path}") + return sorted(files) + + +def _normalize_record(record: dict, text_field: str, source_id_field: str | None) -> dict: + """Transform a single record into datakit standard format. + + - Extracts and renames the text field to ``text``. + - Generates a deterministic ``id`` from the text content. + - Preserves the original ID (if any) as ``source_id``. + - Preserves all other fields. 
+ """ + text = record.get(text_field) + if text is None: + raise ValueError(f"Record missing required text field {text_field!r}: {list(record.keys())}") + if not isinstance(text, str): + text = str(text) + + doc_id = content_hash_id(text) + + normalized: dict = {"id": doc_id, "text": text} + + if source_id_field is not None and source_id_field in record: + normalized["source_id"] = str(record[source_id_field]) + + # Preserve additional columns + skip_fields = {text_field, source_id_field} if source_id_field else {text_field} + for key, value in record.items(): + if key not in skip_fields and key not in normalized: + normalized[key] = value + + return normalized + + +def normalize( + input_path: str, + output_path: str, + *, + text_field: str = DEFAULT_TEXT_FIELD, + source_id_field: str | None = None, + num_output_shards: int | None = None, + zephyr_max_workers: int = 64, +) -> PathsMetadata: + """Run the normalize pipeline. + + Reads raw files, transforms each record to the standard schema, + repartitions by ``id`` (hash-based), deduplicates, sorts each partition + by ``id``, and writes Parquet output files. + + Args: + input_path: Path to raw input files. + output_path: Directory to write output Parquet files. + text_field: Name of the field containing the primary text content. + source_id_field: Name of an existing ID field to preserve as ``source_id``. + num_output_shards: Number of output Parquet partitions. Defaults to + the number of input files. + zephyr_max_workers: Maximum Zephyr worker parallelism. + + Returns: + PathsMetadata listing the output files. + """ + input_files = _discover_input_files(input_path) + logger.info("Normalizing %d input files from %s", len(input_files), input_path) + + shards = num_output_shards or len(input_files) + + def _sort_shard(records: Iterator[dict], _shard_info: ShardInfo) -> Iterator[dict]: + batch = list(records) + batch.sort(key=lambda r: r["id"]) + return iter(batch) + + output_pattern = os.path.join(output_path, "part-{shard:05d}-of-{total:05d}.parquet") + pipeline = ( + Dataset.from_list(input_files) + .flat_map(load_file) + .map(lambda r: _normalize_record(r, text_field, source_id_field)) + .group_by( + key=lambda r: r["id"], + reducer=lambda _key, records: next(iter(records)), + num_output_shards=shards, + ) + .map_shard(_sort_shard) + .write_parquet(output_pattern) + ) + + ctx = ZephyrContext(name="datakit-normalize", max_workers=min(zephyr_max_workers, shards)) + output_files = list(ctx.execute(pipeline)) + logger.info("Wrote %d normalized Parquet partitions to %s", len(output_files), output_path) + return PathsMetadata(parent_path=output_path, paths=output_files) + + +def normalize_step( + name: str, + *, + input_path: str, + text_field: str = DEFAULT_TEXT_FIELD, + source_id_field: str | None = None, + num_output_shards: int | None = None, + zephyr_max_workers: int = 64, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec for the normalize stage. + + Args: + name: Step name (e.g. "fineweb/normalize"). + input_path: Path to raw input files. + text_field: Name of the field containing the primary text content. + source_id_field: Name of an existing ID field to preserve as ``source_id``. + num_output_shards: Number of output Parquet partitions. + zephyr_max_workers: Maximum Zephyr worker parallelism. + deps: Upstream dependencies (typically the download step). + output_path_prefix: Override the default output path prefix. 
+ override_output_path: Override the computed output path entirely. + + Returns: + A StepSpec whose output_path contains normalized Parquet files. + """ + + def _run(step_output_path: str) -> PathsMetadata: + return normalize( + input_path, + step_output_path, + text_field=text_field, + source_id_field=source_id_field, + num_output_shards=num_output_shards, + zephyr_max_workers=zephyr_max_workers, + ) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={ + "input_path": input_path, + "text_field": text_field, + "source_id_field": source_id_field, + }, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/lib/marin/src/marin/datakit/tokenize.py b/lib/marin/src/marin/datakit/tokenize.py new file mode 100644 index 0000000000..0e5c9b4168 --- /dev/null +++ b/lib/marin/src/marin/datakit/tokenize.py @@ -0,0 +1,71 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Datakit tokenize stage — convert normalized Parquet datasets into Levanter cache format. + +This is the final stage of the datakit pipeline. It reads normalized Parquet +files and produces tokenized training data in Levanter's TreeStore format. + +Tokenization is the boundary where per-document structure ends. The tokenizer +concatenates documents into fixed-size token sequences for efficient training. +""" + +import logging + +from marin.execution.step_spec import StepSpec +from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize + +logger = logging.getLogger(__name__) + + +def tokenize_step( + name: str, + *, + input_path: str, + tokenizer: str, + max_workers: int = 4096, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that tokenizes a normalized dataset. + + Reads normalized Parquet files and produces Levanter cache format output + suitable for training. + + Args: + name: Step name (e.g. "fineweb/tokenize"). + input_path: Path to normalized Parquet files (output of normalize step). + tokenizer: HuggingFace tokenizer name (e.g. "meta-llama/Llama-3.1-8B"). + max_workers: Maximum Zephyr worker parallelism. + deps: Upstream dependencies (typically the normalize or consolidate step). + output_path_prefix: Override the default output path prefix. + override_output_path: Override the computed output path entirely. + + Returns: + A StepSpec whose output_path contains the tokenized Levanter cache. 
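
    Example (illustrative; ``norm`` is assumed to be an upstream normalize step)::

        tok = tokenize_step(
            "fineweb/tokenize",
            input_path=norm.output_path,
            tokenizer="meta-llama/Llama-3.1-8B",
            deps=[norm],
        )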
+ """ + + def _run(output_path: str) -> None: + tokenize( + TokenizeConfig( + train_paths=[input_path], + validation_paths=[], + cache_path=output_path, + tokenizer=tokenizer, + max_workers=max_workers, + allow_test_in_train=True, + ) + ) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={ + "input_path": input_path, + "tokenizer": tokenizer, + }, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/tests/datakit/__init__.py b/tests/datakit/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/tests/datakit/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py new file mode 100644 index 0000000000..1c29e35a9c --- /dev/null +++ b/tests/datakit/test_datakit.py @@ -0,0 +1,76 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Integration test for the datakit pipeline: download → normalize → tokenize, wired as StepSpecs.""" + +from pathlib import Path + +import numpy as np +import pyarrow.parquet as pq +import pytest +from levanter.store.cache import CacheLedger, TreeCache + +from marin.datakit.download import download_step +from marin.datakit.normalize import content_hash_id, normalize_step +from marin.datakit.tokenize import tokenize_step +from marin.execution.step_runner import StepRunner + + +@pytest.mark.slow +def test_download_normalize_tokenize(tmp_path): + """Download → normalize → tokenize as a StepSpec DAG via StepRunner.""" + + dl = download_step( + "datakit/download", + hf_dataset_id="wikitext", + revision="main", + hf_urls_glob=["wikitext-2-v1/test-*.parquet"], + override_output_path=str(tmp_path / "raw"), + ) + + norm = normalize_step( + "datakit/normalize", + input_path=dl.output_path, + deps=[dl], + override_output_path=str(tmp_path / "normalized"), + ) + + tok = tokenize_step( + "datakit/tokenize", + input_path=norm.output_path, + tokenizer="gpt2", + deps=[norm], + override_output_path=str(tmp_path / "tokenized"), + ) + + StepRunner().run([dl, norm, tok]) + + # -- Verify download output -- + raw_files = [f for f in Path(dl.output_path).rglob("*") if f.is_file() and not f.name.startswith(".")] + assert len(raw_files) >= 1 + + # -- Verify normalize output -- + parquet_files = sorted(Path(norm.output_path).glob("*.parquet")) + assert len(parquet_files) >= 1 + + all_records = [] + for pf in parquet_files: + records = pq.read_table(str(pf)).to_pylist() + all_records.extend(records) + ids = [r["id"] for r in records] + assert ids == sorted(ids), f"Partition {pf.name} not sorted by id" + + assert len(all_records) > 0 + for record in all_records: + assert record["id"] == content_hash_id(record["text"]) + + # -- Verify tokenize output -- + train_dir = Path(tok.output_path) / "train" + ledger = CacheLedger.load(str(train_dir)) + assert ledger.is_finished + assert ledger.total_num_rows > 0 + + exemplar = {"input_ids": np.array([0], dtype=np.int32)} + cache = TreeCache.load(str(train_dir), exemplar=exemplar) + assert len(cache) == ledger.total_num_rows + assert len(cache[0]["input_ids"]) > 0 From 3f78ea6fc21c2e652460e79c410e23a9dcf4bb7c Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:01:29 -0700 Subject: [PATCH 03/56] StepSpec: auto-prefix relative override_output_path with marin_prefix When override_output_path is a relative path (no URL scheme, doesn't start with /), StepSpec.output_path now automatically prepends 
output_path_prefix or marin_prefix(). This matches the existing Executor behavior and enables datasets to use short relative paths like "raw/fineweb" in StepSpec definitions. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/execution/step_spec.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lib/marin/src/marin/execution/step_spec.py b/lib/marin/src/marin/execution/step_spec.py index 76ef153aaa..bed4db725d 100644 --- a/lib/marin/src/marin/execution/step_spec.py +++ b/lib/marin/src/marin/execution/step_spec.py @@ -10,10 +10,18 @@ from dataclasses import dataclass from functools import cached_property from typing import Any +from urllib.parse import urlparse from iris.marin_fs import marin_prefix +def _is_relative_path(url_or_path: str) -> bool: + """Return True if the path is relative (not a URL and doesn't start with /).""" + if urlparse(url_or_path).scheme: + return False + return not url_or_path.startswith("/") + + @dataclass(frozen=True) class _StepSpecMigrationConfig: """Temporary config used by ``StepSpec.as_executor_step()`` during the @@ -86,11 +94,17 @@ def name_with_hash(self) -> str: @cached_property def output_path(self) -> str: - """Output path of the step""" - if self.override_output_path is not None: - return self.override_output_path + """Output path of the step. + If ``override_output_path`` is set and relative (no URL scheme, doesn't + start with ``/``), it is automatically prefixed with ``output_path_prefix`` + or ``marin_prefix()``. + """ prefix = self.output_path_prefix or marin_prefix() + if self.override_output_path is not None: + if _is_relative_path(self.override_output_path): + return f"{prefix}/{self.override_output_path}" + return self.override_output_path return f"{prefix}/{self.name_with_hash}" def as_executor_step(self) -> ExecutorStep: # noqa: F821 From 4f84f71431401df1d7a2ba2cefcb55d5963bd234 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:03:05 -0700 Subject: [PATCH 04/56] Convert datakit/download to package and move HF download modules Converts the single datakit/download.py file into a datakit/download/ package. Moves the HuggingFace download modules (download_hf, stream_remove_columns, upload_gcs_to_hf) into datakit/download/ as their canonical location. Adds download_hf_step() as the StepSpec factory function. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/__init__.py | 18 + .../src/marin/datakit/download/huggingface.py | 409 ++++++++++++++++++ .../datakit/download/stream_remove_columns.py | 101 +++++ .../datakit/download/upload_gcs_to_hf.py | 364 ++++++++++++++++ 4 files changed, 892 insertions(+) create mode 100644 lib/marin/src/marin/datakit/download/__init__.py create mode 100644 lib/marin/src/marin/datakit/download/huggingface.py create mode 100644 lib/marin/src/marin/datakit/download/stream_remove_columns.py create mode 100644 lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py diff --git a/lib/marin/src/marin/datakit/download/__init__.py b/lib/marin/src/marin/datakit/download/__init__.py new file mode 100644 index 0000000000..cc14fdbdf4 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/__init__.py @@ -0,0 +1,18 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +from marin.datakit.download.huggingface import ( + DownloadConfig, + download_hf, + download_hf_step, +) + +# Backward-compat alias: download_step was the original name in the single-file module. 
+download_step = download_hf_step + +__all__ = [ + "DownloadConfig", + "download_hf", + "download_hf_step", + "download_step", +] diff --git a/lib/marin/src/marin/datakit/download/huggingface.py b/lib/marin/src/marin/datakit/download/huggingface.py new file mode 100644 index 0000000000..6a6ff13cd2 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/huggingface.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +A script to download a HuggingFace dataset and upload it to a specified fsspec path, +using HfFileSystem for direct streaming of data transfer. +""" + +import logging +import os +import random +import socket +import time +from dataclasses import dataclass, field + +import draccus +import huggingface_hub +from huggingface_hub import HfFileSystem +from iris.marin_fs import open_url, url_to_fs +from huggingface_hub.errors import HfHubHTTPError +from packaging.version import Version +from marin.execution.executor import THIS_OUTPUT_PATH +from marin.execution.step_spec import StepSpec +from marin.utilities.validation_utils import write_provenance_json +from zephyr import Dataset, ZephyrContext +from zephyr.writers import atomic_rename +from iris.logging import configure_logging + +logger = logging.getLogger(__name__) + +HF_PROTOCOL_PREFIX = "hf://" +HF_BUCKET_PATH_PREFIX = "buckets/" + + +@dataclass(frozen=True) +class DownloadConfig: + # fmt: off + + # HuggingFace Dataset Parameters + hf_dataset_id: str # HF Dataset to Download (as `$ORG/$DATASET` on HF Hub) + + revision: str # (Short) Commit Hash (from HF Dataset Repo; 7 characters) + hf_urls_glob: list[str] = field(default_factory=list) + # List of Glob Patterns to Match Files in HF Dataset, If empty we get all the files in a hf repo + + gcs_output_path: str = THIS_OUTPUT_PATH + """ + Path to store raw data in persistent storage (e.g. gs://$BUCKET/...). + This works with any fsspec-compatible path, but for backwards compatibility, we call it gcs_output_path. + """ + + append_sha_to_path: bool = False + """If true, write outputs under ``gcs_output_path/`` instead of directly under ``gcs_output_path``.""" + + # Job Control Parameters, used only for non-gated dataset transfers done via STS + wait_for_completion: bool = True # if True, will block until job completes + + # fmt: on + hf_repo_type_prefix: str = ( + "datasets" # The repo_type_prefix is datasets/ for datasets, + # spaces/ for spaces, and models do not need a prefix in the URL. + ) + + zephyr_max_parallelism: int = 8 + """Maximum parallelism of the Zephyr download job""" + + read_timeout_seconds: float = 120.0 + """Socket read timeout while streaming each HF file. 
Timeout failures trigger retries.""" + + progress_log_interval_seconds: float = 60.0 + """Log a heartbeat for each in-flight shard every N seconds while bytes are flowing.""" + + read_chunk_size_mib: int = 8 + """Chunk size for each streaming read from HF.""" + + +def _strip_hf_protocol(path: str) -> str: + return path.removeprefix(HF_PROTOCOL_PREFIX).lstrip("/") + + +def _resolve_hf_source_path(cfg: DownloadConfig) -> str: + source_path = ( + os.path.join(cfg.hf_repo_type_prefix, cfg.hf_dataset_id) if cfg.hf_repo_type_prefix else cfg.hf_dataset_id + ) + return _strip_hf_protocol(source_path) + + +def _assert_bucket_support_available(source_path: str) -> None: + if not source_path.startswith(HF_BUCKET_PATH_PREFIX): + return + + if Version(huggingface_hub.__version__) < Version("1.6.0"): + raise RuntimeError( + f"Bucket paths require huggingface_hub>=1.6.0, found {huggingface_hub.__version__}. " + "Upgrade the runtime environment to a buckets-capable huggingface_hub version." + ) + + +def _relative_path_in_source(file_path: str, source_path: str) -> str: + normalized_file = _strip_hf_protocol(file_path) + normalized_source = _strip_hf_protocol(source_path).rstrip("/") + + source_prefix = f"{normalized_source}/" + if normalized_file.startswith(source_prefix): + return normalized_file.removeprefix(source_prefix) + + source_parts = [segment for segment in normalized_source.split("/") if segment] + file_parts = [segment for segment in normalized_file.split("/") if segment] + + if len(file_parts) >= len(source_parts): + matches_source = True + for source_segment, file_segment in zip(source_parts, file_parts, strict=False): + if source_segment == file_segment: + continue + if file_segment.split("@", 1)[0] == source_segment: + continue + matches_source = False + break + + if matches_source: + return "/".join(file_parts[len(source_parts) :]) + + # Backwards-compatible fallback for historical dataset path layout. + return normalized_file.split("/", 3)[-1] + + +def ensure_fsspec_path_writable(output_path: str) -> None: + """Check if the fsspec path is writable by trying to create and delete a temporary file.""" + fs, _ = url_to_fs(output_path) + try: + fs.mkdirs(output_path, exist_ok=True) + test_path = os.path.join(output_path, "test_write_access") + with fs.open(test_path, "w") as f: + f.write("test") + fs.rm(test_path) + except Exception as e: + raise ValueError(f"No write access to fsspec path: {output_path} ({e})") from e + + +def stream_file_to_fsspec( + gcs_output_path: str, + file_path: str, + fsspec_file_path: str, + expected_size: int | None = None, + read_timeout_seconds: float = 120.0, + progress_log_interval_seconds: float = 60.0, + read_chunk_size_mib: int = 8, +): + """Stream a file from HfFileSystem to another fsspec path using atomic write. + + Uses atomic_rename to write to a temp file first, then rename on success. + This enables recovery across individual files if the job is interrupted. + + Args: + gcs_output_path: Base output path for the download. + file_path: Source file path on HuggingFace. + fsspec_file_path: Target file path on the destination filesystem. + expected_size: Expected file size in bytes for validation. If provided, + the download will fail if the downloaded size doesn't match. 
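        read_timeout_seconds: Socket read timeout while streaming from HF; a timeout triggers a retry.
        progress_log_interval_seconds: Seconds between per-file progress log lines.
        read_chunk_size_mib: Size in MiB of each streaming read.

    Returns:
        A dict with ``file_path``, ``status`` ("success"), and ``size`` (bytes written).

    Raises:
        RuntimeError: If the file cannot be downloaded within the retry budget.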
+ """ + hf_fs = HfFileSystem(token=os.environ.get("HF_TOKEN", False)) + target_fs, _ = url_to_fs(gcs_output_path) + chunk_size = max(1, int(read_chunk_size_mib)) * 1024 * 1024 + max_retries = 20 + # 15 minutes max sleep + max_sleep = 15 * 60 + # Minimum base wait time to avoid too-fast retries + min_base_wait = 5 + + # Retry when there is an error, such as hf rate limit + last_exception = None + for attempt in range(max_retries): + try: + target_fs.mkdirs(os.path.dirname(fsspec_file_path), exist_ok=True) + bytes_written = 0 + with atomic_rename(fsspec_file_path) as temp_path: + previous_socket_timeout = socket.getdefaulttimeout() + socket.setdefaulttimeout(read_timeout_seconds) + try: + with ( + hf_fs.open(file_path, "rb", block_size=chunk_size) as src_file, + open_url(temp_path, "wb") as dest_file, + ): + start_time = time.monotonic() + next_progress_log = start_time + progress_log_interval_seconds + while True: + try: + chunk = src_file.read(chunk_size) + except TimeoutError as timeout_error: + raise TimeoutError( + f"Timed out reading from {file_path} after " + f"{read_timeout_seconds:.1f}s with {bytes_written} bytes written" + ) from timeout_error + if not chunk: + break + dest_file.write(chunk) + bytes_written += len(chunk) + now = time.monotonic() + if progress_log_interval_seconds > 0 and now >= next_progress_log: + elapsed = max(now - start_time, 1e-9) + speed_mib_s = (bytes_written / (1024**2)) / elapsed + logger.info( + f"Streaming {file_path}: {bytes_written / (1024**2):.1f} MiB written " + f"in {elapsed:.1f}s ({speed_mib_s:.2f} MiB/s)" + ) + next_progress_log = now + progress_log_interval_seconds + finally: + socket.setdefaulttimeout(previous_socket_timeout) + + # Validate file size BEFORE atomic_rename commits the file + if expected_size is not None and bytes_written != expected_size: + raise ValueError( + f"Size mismatch for {file_path}: expected {expected_size} bytes, got {bytes_written} bytes" + ) + + logger.info(f"Streamed {file_path} successfully to {fsspec_file_path} ({bytes_written} bytes)") + return {"file_path": file_path, "status": "success", "size": bytes_written} + except Exception as e: + last_exception = e + # Base wait: min 5s, then exponential: 5, 10, 20, 40, 80, 160, 320, 600 (capped) + wait_base = max(min_base_wait, min_base_wait * (2**attempt)) + + error_type = type(e).__name__ + error_msg = str(e) + status_code = -1 + + if isinstance(e, HfHubHTTPError): + status_code = e.response.status_code + TOO_MANY_REQUESTS = 429 + if status_code == TOO_MANY_REQUESTS: + # NOTE: RateLimit "api\|pages\|resolvers";r=[remaining];t=[seconds remaining until reset] + try: + rate_limit_wait = int(e.response.headers["RateLimit"].split(";")[-1].split("=")[-1]) + wait_base = max(wait_base, rate_limit_wait + 10) # Add buffer to rate limit wait + except Exception: + logger.warning("Failed to parse rate limit header, using default wait period") + + logger.warning( + f"Attempt {attempt + 1}/{max_retries} failed for {file_path}: " + f"{error_type} (status={status_code}): {error_msg}" + ) + + jitter = random.uniform(0, min(wait_base * 0.25, 30)) # Up to 25% jitter, max 30s + wait_time = min(wait_base + jitter, max_sleep) + + logger.info(f"Retrying {file_path} in {wait_time:.1f}s...") + time.sleep(wait_time) + + raise RuntimeError( + f"Failed to download {file_path} after {max_retries} attempts. 
" + f"Last error: {type(last_exception).__name__}: {last_exception}" + ) + + +def download_hf(cfg: DownloadConfig) -> None: + + configure_logging(level=logging.INFO) + + # Set cfg.append_sha_to_path=True to mimic the older behavior of writing to gcs_output_path/. + # Some historical datasets were written that way, so this flag keeps backwards compatibility when needed. + + # Ensure the output path is writable + try: + output_path = os.path.join(cfg.gcs_output_path, cfg.revision) if cfg.append_sha_to_path else cfg.gcs_output_path + ensure_fsspec_path_writable(output_path) + except ValueError as e: + logger.exception(f"Output path validation failed: {e}") + raise e + + # Initialize Hugging Face filesystem + logger.info("Identifying files to download from HuggingFace...") + hf_fs = HfFileSystem(token=os.environ.get("HF_TOKEN", False)) + hf_source_path = _resolve_hf_source_path(cfg) + _assert_bucket_support_available(hf_source_path) + + if not cfg.hf_urls_glob: + # We get all the files using find + files = hf_fs.find(hf_source_path, revision=cfg.revision) + else: + # Get list of files directly from HfFileSystem matching the pattern + files = [] + for hf_url_glob in cfg.hf_urls_glob: + pattern = os.path.join(hf_source_path, hf_url_glob) + files += hf_fs.glob(pattern, revision=cfg.revision) + + if not files: + raise ValueError(f"No files found for dataset `{cfg.hf_dataset_id}. Used glob patterns: {cfg.hf_urls_glob}") + + # Get file sizes for validation + logger.info("Getting file sizes for validation...") + file_sizes: dict[str, int | None] = {} + for file in files: + try: + info = hf_fs.info(file, revision=cfg.revision) + file_sizes[file] = info.get("size") or None + except Exception as e: + logger.warning(f"Could not get size for {file}: {e}") + file_sizes[file] = None # Will skip validation for this file + + download_tasks = [] + + for file in files: + try: + relative_file_path = _relative_path_in_source(file, hf_source_path) + if relative_file_path.startswith(".."): + raise ValueError(f"Computed path escapes source root: source={hf_source_path}, file={file}") + fsspec_file_path = os.path.join(output_path, relative_file_path) + expected_size = file_sizes.get(file) + download_tasks.append( + ( + output_path, + file, + fsspec_file_path, + expected_size, + cfg.read_timeout_seconds, + cfg.progress_log_interval_seconds, + cfg.read_chunk_size_mib, + ) + ) + except Exception as e: + logging.exception(f"Error preparing task for {file}: {e}") + + total_files = len(download_tasks) + total_size_gb = sum(s for s in file_sizes.values() if s is not None) / (1024**3) + logger.info(f"Total number of files to process: {total_files} ({total_size_gb:.2f} GB)") + + pipeline = ( + Dataset.from_list(download_tasks) + .map(lambda task: stream_file_to_fsspec(*task)) + .write_jsonl( + f"{cfg.gcs_output_path}/.metrics/success-part-{{shard:05d}}-of-{{total:05d}}.jsonl", skip_existing=True + ) + ) + ctx = ZephyrContext(name="download-hf", max_workers=cfg.zephyr_max_parallelism) + ctx.execute(pipeline) + + # Write Provenance JSON + write_provenance_json( + output_path, + metadata={"dataset": cfg.hf_dataset_id, "version": cfg.revision, "links": files}, + ) + + logger.info(f"Streamed all files and wrote provenance JSON; check {output_path}.") + + +def download_hf_step( + name: str, + *, + hf_dataset_id: str, + revision: str, + hf_urls_glob: list[str] | None = None, + zephyr_max_parallelism: int = 8, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) 
-> StepSpec: + """Create a StepSpec that downloads a HuggingFace dataset. + + The raw download is preserved as-is in its original format and directory structure. + + Args: + name: Step name (e.g. "raw/fineweb"). + hf_dataset_id: HuggingFace dataset identifier (e.g. "HuggingFaceFW/fineweb"). + revision: Commit hash from the HF dataset repo. + hf_urls_glob: Glob patterns to select specific files. Empty means all files. + zephyr_max_parallelism: Maximum download parallelism. + deps: Optional upstream dependencies. + output_path_prefix: Override the default output path prefix. + override_output_path: Override the computed output path entirely. + + Returns: + A StepSpec whose output_path contains the raw downloaded files. + """ + resolved_glob = hf_urls_glob or [] + + def _run(output_path: str) -> None: + download_hf( + DownloadConfig( + hf_dataset_id=hf_dataset_id, + revision=revision, + hf_urls_glob=resolved_glob, + gcs_output_path=output_path, + zephyr_max_parallelism=zephyr_max_parallelism, + ) + ) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={ + "hf_dataset_id": hf_dataset_id, + "revision": revision, + "hf_urls_glob": resolved_glob, + }, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) + + +@draccus.wrap() +def main(cfg: DownloadConfig) -> None: + """Download HuggingFace dataset.""" + download_hf(cfg) + + +if __name__ == "__main__": + main() diff --git a/lib/marin/src/marin/datakit/download/stream_remove_columns.py b/lib/marin/src/marin/datakit/download/stream_remove_columns.py new file mode 100644 index 0000000000..b16e3a1f1b --- /dev/null +++ b/lib/marin/src/marin/datakit/download/stream_remove_columns.py @@ -0,0 +1,101 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Remove unnecessary columns while streaming data from huggingface.""" + +import logging +import os +from dataclasses import dataclass + +import pandas as pd +import pyarrow.parquet as pq +from huggingface_hub import HfFileSystem +from tqdm import tqdm +from zephyr import Dataset, ZephyrContext + +hf_fs = HfFileSystem() +logger = logging.getLogger(__name__) + + +def prune_stream_and_save(input_file: str, output_file: str, keep_columns: list[str]): + """ + Prunes and saves a parquet file by removing un-specified columns. + + Reads the input parquet file in batches, removes columns not in keep_columns, + and writes the result to output_file. Processing in batches avoids memory issues. + + Args: + input_file (str): Path to input parquet file on HuggingFace + output_file (str): Path where pruned parquet file will be saved + keep_columns (list[str]): List of column names to retain + """ + parquet_file = pq.ParquetFile(hf_fs.open(input_file)) + + full_df_list = [] + for batch in tqdm(parquet_file.iter_batches(batch_size=10000), desc=f"Processing {input_file}"): + df = batch.to_pandas() + + drop_columns = [col for col in df.columns if col not in keep_columns] + df = df.drop(columns=drop_columns) + + full_df_list.append(df) + + full_df = pd.concat(full_df_list) + logger.info(f"Saving pruned dataset of shape {full_df.shape} to {output_file}") + full_df.to_parquet(output_file, index=False) + + +def get_file_tasks(hf_path: str, output_path: str, keep_columns: list[str]): + """ + Generate file processing tasks for a HuggingFace subset. 
+ + Args: + hf_path (str): The HuggingFace dataset path to load + output_path (str): The output path to save the pruned dataset + keep_columns (list[str]): The columns to keep in the pruned dataset + + Yields: + Dict with input_file, output_file, and keep_columns for each parquet file + """ + logger.info(f"Loading dataset from {hf_path}") + parquet_list = hf_fs.glob(f"{hf_path}/*.parquet") + + for file in parquet_list: + output_file = os.path.join(output_path, os.path.basename(file)) + yield {"input_file": file, "output_file": output_file, "keep_columns": keep_columns} + + +@dataclass +class DatasetConfig: + hf_repo_id: str + hf_revision: str + hf_paths: list[str] + output_path: str + keep_columns: list[str] + + +def prune_hf_dataset(cfg: DatasetConfig): + logger.info(f"Starting dataset pruning for {cfg.hf_paths}") + + # Build list of subset paths to process + subset_tasks = [] + for path in cfg.hf_paths: + # HF Path form: hf://[][@]/ + hf_path = f"hf://datasets/{cfg.hf_repo_id}@{cfg.hf_revision}/{path}" + logger.info(f"Processing subset {hf_path}") + output_path = os.path.join(cfg.output_path, path) + subset_tasks.append({"hf_path": hf_path, "output_path": output_path}) + + # Build pipeline with nested parallelism: + # - Outer level: process subsets (MAX_CONCURRENT_WORKERS=1) + # - Inner level: process files within each subset + pipeline = ( + Dataset.from_list(subset_tasks) + .flat_map(lambda task: get_file_tasks(task["hf_path"], task["output_path"], cfg.keep_columns)) + .map(lambda task: prune_stream_and_save(task["input_file"], task["output_file"], cfg.keep_columns)) + ) + + logger.info("Executing pipeline") + ctx = ZephyrContext(name="hf-remove-columns") + ctx.execute(pipeline) + logger.info("Successfully processed all subsets") diff --git a/lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py b/lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py new file mode 100644 index 0000000000..1aa580c618 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py @@ -0,0 +1,364 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Upload GCS to Hugging Face (HF) Script + +This script transfers model checkpoints or other content from Google Cloud Storage (GCS) +to Hugging Face repositories. 
It handles: +- Finding checkpoint directories in GCS buckets +- Downloading the content locally (to a temporary directory) +- Uploading to a specified Hugging Face repository with appropriate versioning +- Supporting dry-run mode to preview what would be uploaded + +Usage as a script: + python upload_gcs_to_hf.py --repo-id="organization/model-name" [--dry-run] [--directory="gs://bucket/path"] + +Usage as an ExecutorStep: + upload_step = ExecutorStep( + name="upload_model_to_hf", + fn=upload_gcs_to_hf, + config=UploadConfig( + hf_repo_id="organization/model-name", + gcs_directories=["gs://bucket/path/to/model"], + dry_run=False + ) + ) +""" + +import argparse +import logging +import os +import re +import subprocess +import tempfile +from dataclasses import dataclass, field + +from google.cloud import storage +from google.cloud.storage import transfer_manager +from huggingface_hub import HfApi, create_repo +from iris.logging import configure_logging + +# Set up logging +logger = logging.getLogger(__name__) + + +@dataclass +class UploadConfig: + """Configuration for uploading from GCS to Hugging Face.""" + + hf_repo_id: str + gcs_directories: list[str] = field(default_factory=list) + dry_run: bool = False + wait_for_completion: bool = True # Added for compatibility with other configs + + +# Default GCS directories to check if none specified +DEFAULT_GCS_DIRS = [ + "gs://marin-eu-west4/checkpoints/llama-8b-tootsie-0.001-19ad63/hf/", + "gs://marin-us-central2/checkpoints/llama-8b-tootsie-phase2/hf/", + "gs://marin-us-central2/checkpoints/llama-8b-tootsie-phase3/hf/", + "gs://marin-us-central2/checkpoints/tootsie-8b-soft-raccoon-3/hf/", + "gs://marin-us-central2/checkpoints/llama-8b-tootsie-adept-phoenix/hf/", + "gs://marin-us-central2/checkpoints/tootsie-8b-sensible-starling/hf/", + "gs://marin-us-central1/checkpoints/tootsie-8b-deeper-starling/hf/", +] + + +def list_gcs_directories(gcs_path: str) -> list[tuple[str, int]]: + """List subdirectories by examining full blob paths.""" + if not gcs_path.startswith("gs://"): + raise ValueError(f"Invalid GCS path: {gcs_path}") + + path = gcs_path[5:] # Remove "gs://" + bucket_name = path.split("/")[0] + prefix = "/".join(path.split("/")[1:]) + + logger.info(f"Checking: {gcs_path}") + + # Get the bucket + client = storage.Client() + bucket = client.bucket(bucket_name) + + # List blobs with this prefix (without delimiter to get all) + blobs = bucket.list_blobs(prefix=prefix) + + # Extract potential directories from blob paths + directories = set() + step_pattern = re.compile(r"step-\d+") + + for blob in blobs: + # Remove the prefix to get the relative path + relative_path = blob.name[len(prefix) :] + + # Skip if there's no relative path + if not relative_path: + continue + + # Extract the first directory level + parts = relative_path.strip("/").split("/") + if parts: + first_dir = parts[0] + + # Check if it's a step directory + if step_pattern.match(first_dir): + directories.add(first_dir) + + # Process the directories we found + step_dirs_local = [] + for dir_name in directories: + if step_pattern.match(dir_name): + try: + step_number = int(dir_name.split("-")[1]) + full_path = f"{gcs_path}{dir_name}/" + step_dirs_local.append((full_path, step_number)) + logger.info(f"Found step directory: {full_path} with step {step_number}") + except (IndexError, ValueError) as e: + logger.error(f"Error parsing step number from {dir_name}: {e}") + + logger.info(f"Found {len(step_dirs_local)} step directories in {gcs_path}") + return step_dirs_local + + +def 
download_from_gcs(gcs_path: str, local_path: str) -> bool: + """Download contents from a GCS path to a local directory using the GCS transfer manager.""" + logger.info(f"Downloading {gcs_path} to {local_path}...") + + # Parse the GCS path (format: gs://bucket-name/path/to/files) + if not gcs_path.startswith("gs://"): + logger.error(f"Invalid GCS path format: {gcs_path}") + return False + + bucket_name = gcs_path[5:].split("/")[0] + prefix = "/".join(gcs_path[5:].split("/")[1:]) + + # Handle wildcard at the end (the original had f"{gcs_path}*") + if prefix.endswith("*"): + prefix = prefix[:-1] + + # Initialize the GCS client + client = storage.Client() + bucket = client.bucket(bucket_name) + + # List all matching blobs + blobs = list(bucket.list_blobs(prefix=prefix)) + + if not blobs: + logger.error(f"No files found in {gcs_path}") + return False + + total_files = len(blobs) + logger.info(f"Found {total_files} files to download from {gcs_path}") + + # Get the blob names to download (excluding directory placeholders) + blob_names = [] + for blob in blobs: + if not blob.name.endswith("/"): + blob_names.append(blob.name) + + if len(blob_names) < total_files: + logger.info(f"Filtered out {total_files - len(blob_names)} directory markers") + + # Ensure local directory exists + os.makedirs(local_path, exist_ok=True) + + # Log the first few blob names to debug issues + if blob_names: + logger.info(f"Sample blob names (first 3): {', '.join(blob_names[:3])}") + + # Use transfer manager to download all blobs in parallel + logger.info(f"Starting parallel download of {len(blob_names)} files...") + + transfer_manager.download_many_to_path( + bucket=bucket, + blob_names=blob_names, + destination_directory=local_path, + max_workers=8, + create_directories=True, + worker_type="process", + raise_exception=True, + ) + + logger.info(f"Download completed successfully. 
Downloaded {len(blob_names)} files.") + return True + + +def checkpoint_exists(repo_id: str, step: int, version_name: str) -> bool: + """Check if a specific revision exists in a Hugging Face repository.""" + try: + api = HfApi() + commits = api.list_repo_commits(repo_id=repo_id) + for commit in commits: + if f"step {step}" in commit.title: + return True + return False + except Exception: + return False + + +def extract_version_from_path(gcs_path: str) -> str: + """Extract the version name from a GCS path.""" + # Extract model name from path like "gs://marin-eu-west4/checkpoints/llama-8b-tootsie-0.001-19ad63/hf/" + parts = gcs_path.strip("/").split("/") + return parts[-3] + + +def upload_to_huggingface(local_path: str, repo_id: str, step: int, version_name: str) -> bool: + """Upload a local directory to Hugging Face as a specific revision.""" + logger.info(f"Uploading checkpoint {version_name}, step {step} to Hugging Face") + + # Check if repo exists, create if not + api = HfApi() + create_repo(repo_id=repo_id, exist_ok=True) + # Upload the directory + result = api.upload_folder( + folder_path=local_path, + repo_id=repo_id, + commit_message=f"Upload checkpoint for step {step} ({version_name})", + ) + try: + api.delete_tag(repo_id=repo_id, tag=version_name) + except Exception: + logger.info("Creating tag for the first time") + api.create_tag(repo_id=repo_id, tag=version_name) + logger.info("Upload completed successfully.") + logger.info(f"Commit URL: {result.commit_url}") + return True + + +def upload_gcs_to_hf(cfg: UploadConfig) -> None: + """Main function to upload model checkpoints from GCS to Hugging Face.""" + + configure_logging(level=logging.INFO) + + # Collect all step directories + all_step_dirs = [] + + # Determine which directories to process + directories_to_process = cfg.gcs_directories if cfg.gcs_directories else DEFAULT_GCS_DIRS + + # Process each directory + for directory in directories_to_process: + try: + step_dirs = list_gcs_directories(directory) + all_step_dirs.extend(step_dirs) + except Exception as e: + logger.error(f"Error listing {directory}: {e}") + + # Sort all step directories by step number + if all_step_dirs: + all_step_dirs.sort(key=lambda x: x[1]) + + # Print sorted step directories + logger.info("\nAll step directories sorted by step number:") + logger.info("-" * 50) + for full_path, _step_number in all_step_dirs: + logger.info(f"- {full_path}") + + logger.info(f"\nTotal: {len(all_step_dirs)} step directories") + + # Upload to Hugging Face + if not cfg.dry_run: + logger.info(f"\nUploading to Hugging Face repo: {cfg.hf_repo_id}") + + for full_path, step_number in all_step_dirs: + # Extract version name from the path + version_name = extract_version_from_path(full_path) + + # Check if this checkpoint already exists + if checkpoint_exists(cfg.hf_repo_id, step_number, version_name): + logger.info( + f"Step {step_number} for {version_name} already exists in HF repo {cfg.hf_repo_id}, skipping" + ) + continue + + # Create a temporary directory for downloading + with tempfile.TemporaryDirectory() as temp_dir: + logger.info(f"\nProcessing step {step_number} from {full_path} ({version_name})") + + # Download from GCS + if download_from_gcs(full_path, temp_dir): + # Upload to HF + if upload_to_huggingface(temp_dir, cfg.hf_repo_id, step_number, version_name): + logger.info( + f"Successfully uploaded step {step_number} ({version_name}) to HF repo {cfg.hf_repo_id}" + ) + else: + logger.error(f"Failed to upload step {step_number}") + else: + logger.error(f"Failed to 
download step {step_number}") + + logger.info("\nUpload process completed.") + else: + logger.info("\nDry run - showing what would be uploaded:") + logger.info("-" * 50) + + for i, (full_path, step_number) in enumerate(all_step_dirs): + version_name = extract_version_from_path(full_path) + logger.info(f"\nCheckpoint {i + 1}/{len(all_step_dirs)}:") + logger.info(f" Source: {full_path}") + logger.info(f" Target repo: {cfg.hf_repo_id}") + logger.info(f" Revision: {version_name}") + logger.info(f" Commit message: Upload checkpoint for step {step_number} ({version_name})") + + # Try to estimate what files would be uploaded + try: + # Use gsutil to list files in the directory + cmd = ["gsutil", "ls", f"{full_path}"] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + files = result.stdout.strip().split("\n") + # Filter out empty strings and limit to 5 for display + files = [f for f in files if f] + + if files: + logger.info( + f" Example files that would be uploaded ({min(len(files), 5)} of {len(files)}):" + ) + for file in files[:5]: + logger.info(f" - {os.path.basename(file)}") + if len(files) > 5: + logger.info(f" - ... and {len(files) - 5} more") + except Exception as e: + logger.error(f" Could not list files: {e}") + + logger.info("\nDry run completed - no actual uploads performed.") + else: + logger.warning("\nNo step directories found in any of the paths.") + logger.warning("You might want to check if:") + logger.warning("1. The paths are correct") + logger.warning("2. You have permissions to access these buckets") + logger.warning("3. There are step directories in these locations") + + +def main(): + """Command line entry point for direct script usage.""" + parser = argparse.ArgumentParser(description="Upload checkpoints from GCS to Hugging Face") + parser.add_argument( + "--repo-id", required=True, help='Target Hugging Face repository ID (e.g., "username/model-name")' + ) + parser.add_argument("--dry-run", action="store_true", help="Only list checkpoints without uploading") + parser.add_argument( + "--directories", + nargs="+", + help="Process specific GCS directories instead of the built-in list. Multiple directories can be provided.", + ) + args = parser.parse_args() + + # Create config from args + config = UploadConfig( + hf_repo_id=args.repo_id, gcs_directories=args.directories if args.directories else [], dry_run=args.dry_run + ) + + # Check if application default credentials are set + if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ: + logger.warning("Warning: GOOGLE_APPLICATION_CREDENTIALS environment variable not set.") + logger.warning("Make sure you're authenticated with Google Cloud before running this script.") + logger.warning("You can authenticate using: gcloud auth application-default login") + + # Run the upload function + upload_gcs_to_hf(config) + + +if __name__ == "__main__": + main() From 0a1413a628c1a3c9394e6893fe5db705f1db0a70 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:08:58 -0700 Subject: [PATCH 05/56] Move specialized downloaders to datakit/download/ with StepSpec factories Moves nemotron_cc, uncheatable_eval, ar5iv, dclm_hq, wikipedia, and filesystem modules into datakit/download/. Each module gains a *_step() factory returning StepSpec. Renames ambiguous DownloadConfig classes to Ar5ivDownloadConfig and WikipediaDownloadConfig. The uncheatable_eval make_uncheatable_eval_step() is preserved as a compat wrapper around the new uncheatable_eval_step(). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/ar5iv.py | 160 +++++++ .../src/marin/datakit/download/dclm_hq.py | 232 ++++++++++ .../src/marin/datakit/download/filesystem.py | 101 ++++ .../src/marin/datakit/download/nemotron_cc.py | 142 ++++++ .../datakit/download/uncheatable_eval.py | 438 ++++++++++++++++++ .../src/marin/datakit/download/wikipedia.py | 150 ++++++ 6 files changed, 1223 insertions(+) create mode 100644 lib/marin/src/marin/datakit/download/ar5iv.py create mode 100644 lib/marin/src/marin/datakit/download/dclm_hq.py create mode 100644 lib/marin/src/marin/datakit/download/filesystem.py create mode 100644 lib/marin/src/marin/datakit/download/nemotron_cc.py create mode 100644 lib/marin/src/marin/datakit/download/uncheatable_eval.py create mode 100644 lib/marin/src/marin/datakit/download/wikipedia.py diff --git a/lib/marin/src/marin/datakit/download/ar5iv.py b/lib/marin/src/marin/datakit/download/ar5iv.py new file mode 100644 index 0000000000..86498e12e1 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/ar5iv.py @@ -0,0 +1,160 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Download and process Ar5iv dataset from a zip file. + +Example Usage: +uv run zephyr --backend=ray --max-parallelism=1000 --memory=10GB \ + lib/marin/src/marin/download/ar5iv/download.py \ + --input_path gs://bucket/ar5iv.zip \ + --output_path gs://bucket/output +""" + +import json +import logging +import zipfile +from collections import defaultdict +from dataclasses import dataclass + +import draccus +from iris.marin_fs import open_url +from marin.execution.step_spec import StepSpec +from zephyr import Dataset, ZephyrContext +from zephyr.writers import atomic_rename +from iris.logging import configure_logging + +logger = logging.getLogger(__name__) + + +@dataclass +class Ar5ivDownloadConfig: + input_path: str + output_path: str + max_files: int | None = None # Maximum number of shards to process + + +def process_shard(shard_task: dict) -> dict: + """ + Process a single shard by extracting its files from the zip in GCS and uploading the merged JSONL. + + Args: + shard_task: Dict with keys 'input_path', 'output_path', 'shard_id', 'file_list' + """ + input_path = shard_task["input_path"] + output_path = shard_task["output_path"] + shard_id = shard_task["shard_id"] + file_list = shard_task["file_list"] + gcs_path = f"{output_path}/{shard_id}.jsonl.gz" + + with open_url(str(input_path), "rb") as f: + with zipfile.ZipFile(f) as zf: + with atomic_rename(gcs_path) as temp_path, open_url(temp_path, "wt", compression="gzip") as out_f: + for filename in file_list: + with zf.open(filename, "r") as file_handle: + content = file_handle.read() + record = { + "filename": filename, + "format": "html", + "content": content.decode("utf-8", errors="replace"), + } + print(json.dumps(record), file=out_f) + + logger.info(f"Shard {shard_id} with {len(file_list)} files uploaded to {gcs_path}") + return {"shard_id": shard_id, "num_files": len(file_list), "output_path": gcs_path} + + +def download(cfg: Ar5ivDownloadConfig) -> None: + """ + Download and process Ar5iv dataset from a zip file in GCS. + + This function can be called by the executor framework or used standalone. 
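+
+    A minimal standalone sketch (bucket paths are hypothetical):
+
+        download(Ar5ivDownloadConfig(
+            input_path="gs://example/ar5iv.zip",
+            output_path="gs://example/ar5iv-out",
+            max_files=2,
+        ))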
+ """ + logger.info("Starting transfer of Ar5iv dataset...") + logger.info(f"Source: {cfg.input_path}") + + # Use fsspec+zipfile to list all files + with open_url(str(cfg.input_path), "rb") as f: + with zipfile.ZipFile(f) as zf: + all_files = zf.infolist() + + # Group by shard directory + # We assume structure: something like: shard_id/.../file + # shard_id is derived from the second last component if files are nested. + # Adjust as needed if directory structure differs. + shard_dict = defaultdict(list) + for info in all_files: + if info.is_dir(): + continue + # E.g. path might look like: "003/something.html" + # Extract shard_id from the directory: + # Split by "/" and take the first part if we assume structure {shard_id}/file + parts = info.filename.strip("/").split("/") + if len(parts) < 2: + # File at root level - decide how to handle this case. + # If no directory structure is given, skip or treat differently. + continue + shard_id = parts[-2] # get the second-last directory as shard_id + shard_dict[shard_id].append(info.filename) + + # Apply max_files limit if provided + shard_ids = list(shard_dict.keys()) + if cfg.max_files is not None: + shard_ids = shard_ids[: cfg.max_files] + + logger.info(f"Found {len(shard_ids)} shards to process.") + + # Build task list for each shard + shard_tasks = [] + for shard_id in shard_ids: + shard_tasks.append( + { + "input_path": cfg.input_path, + "output_path": cfg.output_path, + "shard_id": shard_id, + "file_list": shard_dict[shard_id], + } + ) + + # Execute pipeline with zephyr + pipeline = ( + Dataset.from_list(shard_tasks) + .map(process_shard) + .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) + ) + ctx = ZephyrContext(name="download-ar5iv") + ctx.execute(pipeline) + + logger.info("Transfer completed successfully!") + + +def ar5iv_step( + name: str = "raw/ar5iv", + *, + input_path: str, + max_files: int | None = None, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads and processes the Ar5iv dataset from a zip file.""" + + def _run(output_path: str) -> None: + download(Ar5ivDownloadConfig(input_path=input_path, output_path=output_path, max_files=max_files)) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={"input_path": input_path, "max_files": max_files}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) + + +@draccus.wrap() +def main(cfg: Ar5ivDownloadConfig) -> None: + """CLI entrypoint for downloading and processing Ar5iv dataset.""" + + configure_logging(level=logging.INFO) + download(cfg) diff --git a/lib/marin/src/marin/datakit/download/dclm_hq.py b/lib/marin/src/marin/datakit/download/dclm_hq.py new file mode 100644 index 0000000000..83c127c079 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/dclm_hq.py @@ -0,0 +1,232 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Download DCLM HQ HTML data by fetching HTML content from Common Crawl. + +Processes DCLM HQ JSONL files and enriches them with HTML content fetched from Common Crawl +via a custom index server. Uses zephyr for parallel processing with flattened parallelism. 
+ +Example Usage: +uv run zephyr --backend=ray --max-parallelism=800 --memory=2GB \ + lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py \ + --input_path gs://marin-us-central2/raw/dclm-baseline-1.0-parquet/global/ \ + --output_path gs://marin-data/processed/dclm-hq-html/ +""" + +import io +import json +import logging +import os +import re +from dataclasses import dataclass + +import requests +from iris.marin_fs import open_url +from marin.execution.step_spec import StepSpec +import warcio +from marin.utils import fsspec_glob +from tqdm import tqdm +from zephyr import Dataset, ZephyrContext +from zephyr.writers import ensure_parent_dir + +CC_IDX_HOST_URL = "http://34.72.201.218:8080" +logger = logging.getLogger(__name__) + + +@dataclass +class DCLMHQDownloadConfig: + input_path: str + output_path: str + + +@dataclass +class FileTask: + """Represents a single file processing task.""" + + input_file_path: str + output_file_path: str + + +def fetch_warc_from_cc(s3_warc_path: str, length: int, offset: int) -> str: + """ + Fetch a WARC record from Common Crawl S3 bucket using byte range requests we get + from the CC index via `find_html_in_cc`. + Args: + s3_warc_path: Path to WARC file in S3 bucket + length: Length of the record in bytes + offset: Byte offset of the record in the WARC file + Returns: + The WARC record content as a string + """ + # Convert string values to integers + offset = int(offset) + length = int(length) + + # Make range request to CommonCrawl + response = requests.get( + f"https://data.commoncrawl.org/{s3_warc_path}", headers={"Range": f"bytes={offset}-{offset + length - 1}"} + ) + response.raise_for_status() + + # Parse WARC record and extract HTML content + with io.BytesIO(response.content) as stream: + for record in warcio.ArchiveIterator(stream): + content = record.content_stream().read() + return content.decode(errors="ignore") + + raise ValueError(f"No WARC records found in response from {s3_warc_path}") + + +def find_html_in_cc(split_id: str, target_uri: str) -> str | None: + """ + We host our own index of the Common Crawl over GCP which we use in this function. + For each call we receive a list of chunks that contain the HTML content for the given target URI. + We then fetch each chunk and concatenate them together to form the complete HTML content. + Args: + split_id: The split ID of the Common Crawl + target_uri: The target URI to find the HTML content for + Returns: + The HTML content as a string + """ + resp = requests.get(f"{CC_IDX_HOST_URL}/{split_id}-index?url={target_uri}&output=json") + + resp.raise_for_status() + + chunks = [json.loads(chunk) for chunk in resp.text.split("\n") if chunk] + sorted_chunks = sorted(chunks, key=lambda x: x["offset"]) + + html_content = "" + + for chunk in sorted_chunks: + warc_path = chunk["filename"] + length = chunk["length"] + offset = chunk["offset"] + + warc_record = fetch_warc_from_cc(warc_path, length, offset) + + html_content += warc_record + + return html_content + + +def process_file(task: FileTask) -> None: + """Process a single DCLM file, fetching HTML from Common Crawl. 
+ + Args: + task: FileTask containing input and output file paths + """ + logger.info(f"Starting processing of file {task.input_file_path}") + logger.info(f"Source: {task.input_file_path}") + logger.info(f"Destination: {task.output_file_path}") + try: + ensure_parent_dir(task.output_file_path) + with ( + open_url(task.input_file_path, compression="zstd") as source, + open_url(task.output_file_path, "wt", compression="gzip") as output, + ): + text_wrapper = io.TextIOWrapper(source, encoding="utf-8") + + for line in tqdm(text_wrapper, desc="Processing lines"): + row = json.loads(line.strip()) + + # We need to extract the split from where the record was for querying the index + # The only place we have this information is in the warcinfo key in DCLM HQ + # The format is: + # warc-type: WARC/1.1 + # ... + # isPartOf: CC-MAIN-2024-01 + # This however is a string and not a key-value pair, so we need to extract + # the split from it via regex pattern `isPartOf:\s*(CC-MAIN-\d{4}-\d{2})`. + # This pattern groups the value of the key `isPartOf` that is of the form + # `CC-MAIN-xxxx-xx` where `xxxx` is a year and `xx` is a month. + match = re.search(r"isPartOf:\s*(CC-MAIN-\d{4}-\d{2})", row["metadata"]["warcinfo"]) + if match is None: + logger.error(f"No split found for record ID: {row['metadata']['WARC-Record-ID']}") + continue + + is_part_of = match.group(1) + + try: + html_string = find_html_in_cc(is_part_of, row["metadata"]["WARC-Target-URI"]) + + if html_string is None: + logger.error(f"No HTML found for record ID: {row['metadata']['WARC-Record-ID']}") + continue + + if "text" in row: + row.pop("text") + + row["html"] = html_string + + print(json.dumps(row), file=output) + except Exception as e: + logger.exception(f"Error processing line: {e}") + continue + + logger.info("\nProcessing completed successfully!") + logger.info(f"File available at: {task.output_file_path}") + + except Exception as e: + logger.error(f"Error during processing: {e}") + raise + + +def extract_dclm_hq_dump(cfg: DCLMHQDownloadConfig) -> None: + """Process the DCLM HQ dump in the input path and save the results to the output path. + + Flattens the nested directory structure (shards → files) into a single list of files + and processes them in parallel using zephyr. 
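+
+    A minimal usage sketch (bucket paths are hypothetical):
+
+        extract_dclm_hq_dump(DCLMHQDownloadConfig(
+            input_path="gs://example/dclm-baseline/global/",
+            output_path="gs://example/dclm-hq-html/",
+        ))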
+ """ + logger.info(f"Starting processing of DCLM HQ dump in {cfg.input_path}") + + # Flatten nested structure: discover all files upfront + all_files = [] + paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(cfg.input_path, "*"))] + + logger.info(f"Found {len(paths)} shards to process") + + for path in paths: + input_path = os.path.join(cfg.input_path, path) + shard_paths = fsspec_glob(os.path.join(input_path, "*.json.zst")) + + for shard_path in shard_paths: + input_file_path = shard_path + output_file_path = os.path.join(cfg.output_path, path, os.path.basename(shard_path)).replace( + ".json.zst", ".jsonl.gz" + ) + + all_files.append(FileTask(input_file_path=input_file_path, output_file_path=output_file_path)) + + logger.info(f"Found {len(all_files)} files to process") + + # Single-level parallelism over all files + pipeline = Dataset.from_list(all_files).map(process_file) + + ctx = ZephyrContext(name="download-dclm-html") + ctx.execute(pipeline) + + logger.info("Processing completed successfully!") + + +def dclm_hq_step( + name: str = "raw/dclm-hq-html", + *, + input_path: str, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads DCLM HQ HTML data from Common Crawl.""" + + def _run(output_path: str) -> None: + extract_dclm_hq_dump(DCLMHQDownloadConfig(input_path=input_path, output_path=output_path)) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={"input_path": input_path}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/lib/marin/src/marin/datakit/download/filesystem.py b/lib/marin/src/marin/datakit/download/filesystem.py new file mode 100644 index 0000000000..287426666f --- /dev/null +++ b/lib/marin/src/marin/datakit/download/filesystem.py @@ -0,0 +1,101 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import random +import time +from dataclasses import dataclass + +from iris.marin_fs import url_to_fs +from marin.execution.step_spec import StepSpec +from zephyr import Dataset, ZephyrContext + +from marin.utils import fsspec_exists, fsspec_glob + + +@dataclass +class TransferConfig: + input_path: str + output_path: str + + # Selectively choose the number of random files to transfer. None means all files + num_random_files: int | None = None + filetype: str = "jsonl.zst" + + +def transfer_files(config: TransferConfig) -> None: + """Transfers files from the input path to the output path. + + When num_random_files is None, copies the entire directory recursively. + When num_random_files is specified, randomly samples that many files and + copies them in parallel using zephyr. 
+ """ + if config.input_path.endswith("/"): + input_path = config.input_path[:-1] + else: + input_path = config.input_path + + print(f"Downloading {input_path} from GCS.") + start_time: float = time.time() + fs, _ = url_to_fs(input_path) + if not fs.exists(input_path): + raise FileNotFoundError(f"{input_path} does not exist.") + + # Glob all matching files + filenames = fsspec_glob(os.path.join(input_path, f"**/*.{config.filetype}")) + + # Select files: either random sample or all files + if config.num_random_files is None: + selected_files = filenames + else: + random.seed(42) + random.shuffle(filenames) + selected_files = filenames[: config.num_random_files] + + def copy_file(filename: str) -> None: + """Copy a single file if it doesn't already exist at destination.""" + output_filename = os.path.join(config.output_path, os.path.basename(filename)) + if not fsspec_exists(output_filename): + # Ensure output directory exists + fs.makedirs(config.output_path, exist_ok=True) + fs.copy(filename, output_filename) + + # Always use parallel copying via zephyr + pipeline = Dataset.from_list(selected_files).map(copy_file) + ctx = ZephyrContext(name="fs-transfer") + ctx.execute(pipeline) + + elapsed_time_seconds: float = time.time() - start_time + print(f"Downloaded {input_path} to {config.output_path} ({elapsed_time_seconds}s).") + + +def transfer_step( + name: str, + *, + input_path: str, + num_random_files: int | None = None, + filetype: str = "jsonl.zst", + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that transfers files between fsspec paths.""" + + def _run(output_path: str) -> None: + transfer_files( + TransferConfig( + input_path=input_path, + output_path=output_path, + num_random_files=num_random_files, + filetype=filetype, + ) + ) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={"input_path": input_path, "num_random_files": num_random_files, "filetype": filetype}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/lib/marin/src/marin/datakit/download/nemotron_cc.py b/lib/marin/src/marin/datakit/download/nemotron_cc.py new file mode 100644 index 0000000000..4b32983091 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/nemotron_cc.py @@ -0,0 +1,142 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Download and process Nemotron-CC dataset from Common Crawl. 
+ +Example Usage: +uv run zephyr --backend=ray --max-parallelism=100 --memory=4GB \ + lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py \ + --output_path gs://bucket/nemotron-output +""" + +import json +import logging +import os +from collections.abc import Iterator +from dataclasses import dataclass + +import requests +import zstandard +from iris.marin_fs import open_url +from marin.execution import THIS_OUTPUT_PATH +from marin.execution.step_spec import StepSpec +from marin.utils import fsspec_exists +from requests.adapters import HTTPAdapter +from urllib3.util import Retry +from zephyr import Dataset, ZephyrContext +from zephyr.writers import atomic_rename + +logger = logging.getLogger(__name__) + +myagent = "marin-nemotron-ingress/1.0" +NCC_PATH_FILE_URL = "https://data.commoncrawl.org/contrib/Nemotron/Nemotron-CC/data-jsonl.paths.gz" + + +def _iter_jsonl_from_zstd_stream(raw_stream) -> Iterator[dict]: + """Yield parsed JSON objects from a zstd-compressed JSONL stream.""" + dctx = zstandard.ZstdDecompressor() + with dctx.stream_reader(raw_stream) as reader: + buf = bytearray() + while True: + chunk = reader.read(1048576) + if not chunk: + break + buf.extend(chunk) + while True: + newline_pos = buf.find(b"\n") + if newline_pos < 0: + break + line_bytes = bytes(buf[:newline_pos]) + del buf[: newline_pos + 1] + if not line_bytes.strip(): + continue + yield json.loads(line_bytes) + + +def download_single_nemotron_path(input_file_path: str, output_file_path: str) -> dict: + """Fetches content from a Common Crawl path, streaming records to zstd output.""" + cc_url = f"https://data.commoncrawl.org/{input_file_path}" + logger.info(f"Downloading Nemotron CC file {cc_url} to {output_file_path}") + + session = requests.Session() + retries = Retry(total=5, backoff_factor=1.0, status_forcelist=[500, 502, 503, 504], allowed_methods=["GET"]) + adapter = HTTPAdapter(max_retries=retries) + session.mount("https://", adapter) + session.mount("http://", adapter) + + response = session.get(cc_url, headers={"user-agent": myagent}, stream=True) + response.raise_for_status() + + num_records = 0 + with atomic_rename(output_file_path) as temp_path: + with open_url(temp_path, "w", compression="zstd") as out: + for record in _iter_jsonl_from_zstd_stream(response.raw): + dolma_record = { + "id": record["warc_record_id"], + "text": record["text"], + "source": "nemotron", + "format": "text", + "metadata": {f"nemotron_{k}": v for k, v in record.items() if k not in ("warc_record_id", "text")}, + } + print(json.dumps(dolma_record), file=out) + num_records += 1 + + return {"input_file": input_file_path, "output_file": output_file_path, "num_records": num_records} + + +@dataclass +class NemotronIngressConfig: + output_path: str = THIS_OUTPUT_PATH + + +def download_nemotron_cc(cfg: NemotronIngressConfig): + paths_file_path = os.path.join(cfg.output_path, "data-jsonl.paths") + logger.info(f"Downloading Nemotron CC path file {paths_file_path}") + + with open_url(NCC_PATH_FILE_URL, "rb") as f, open_url(paths_file_path, "wb") as f_out: + f_out.write(f.read()) + + logger.info(f"Reading paths from {paths_file_path}") + all_files = [] + with open_url(paths_file_path, "r", compression="gzip") as f: + for line in f: + file = line.strip() + output_file_path = os.path.join(cfg.output_path, file).replace("jsonl.zstd", "jsonl.zst") + all_files.append((file, output_file_path)) + + logger.info(f"Processing {len(all_files)} Nemotron CC files") + + pipeline = ( + Dataset.from_list(all_files) + .filter(lambda file_info: not 
fsspec_exists(file_info[1])) + .map(lambda file_info: download_single_nemotron_path(*file_info)) + .write_jsonl(os.path.join(cfg.output_path, ".metrics/download-{shard:05d}.jsonl"), skip_existing=True) + ) + + ctx = ZephyrContext(name="download-nemotron-cc") + ctx.execute(pipeline) + + logger.info(f"Downloaded Nemotron CC files to {cfg.output_path}") + + +def nemotron_cc_step( + name: str = "raw/nemotron-cc", + *, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl.""" + + def _run(output_path: str) -> None: + download_nemotron_cc(NemotronIngressConfig(output_path=output_path)) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) diff --git a/lib/marin/src/marin/datakit/download/uncheatable_eval.py b/lib/marin/src/marin/datakit/download/uncheatable_eval.py new file mode 100644 index 0000000000..0bcdef3439 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/uncheatable_eval.py @@ -0,0 +1,438 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Download and normalize the latest Uncheatable Eval data dumps.""" + +from __future__ import annotations + +import json +import logging +import os +import posixpath +import re +from collections.abc import Iterable +from dataclasses import dataclass +from typing import Any + +import requests +from iris.marin_fs import open_url +from marin.execution import THIS_OUTPUT_PATH, ExecutorStep, VersionedValue +from marin.execution.step_spec import StepSpec +from marin.utils import fsspec_mkdirs +from requests.adapters import HTTPAdapter +from urllib3.util import Retry +from zephyr import Dataset, ZephyrContext +from zephyr.writers import atomic_rename + +logger = logging.getLogger(__name__) + +FILENAME_PATTERN = re.compile(r"^(?P.+)_(?P\d{8})to(?P\d{8})(?P(?:\.[^.]+)*)$") + +TEXT_FIELD_CANDIDATES: tuple[str, ...] = ( + "text", + "body", + "content", + "article", + "document", + "raw_text", + "code", + "message", + "description", + "story", +) + +LIST_FIELD_CANDIDATES: tuple[str, ...] = ( + "paragraphs", + "sentences", + "lines", + "messages", +) + +ID_FIELD_CANDIDATES: tuple[str, ...] 
= ( + "id", + "uuid", + "guid", + "doc_id", + "document_id", + "article_id", + "hash", + "sha", + "uid", +) + + +@dataclass(frozen=True) +class UncheatableEvalDataset: + """Information about a single data dump file from the Uncheatable Eval repository.""" + + benchmark: str + start_date: str + end_date: str + name: str + download_url: str + sha: str | None = None + size: int | None = None + + @property + def date_range(self) -> str: + return f"{self.start_date}to{self.end_date}" + + @property + def source_label(self) -> str: + return f"{self.benchmark}:{self.date_range}" + + def output_filename(self, suffix: str = ".jsonl.gz") -> str: + return f"{self.benchmark}_{self.date_range}{suffix}" + + +@dataclass +class UncheatableEvalDownloadConfig: + """Configuration for downloading and normalizing Uncheatable Eval dumps.""" + + output_path: str | VersionedValue[str] = THIS_OUTPUT_PATH + repo_owner: str | VersionedValue[str] = "Jellyfish042" + repo_name: str | VersionedValue[str] = "uncheatable_eval" + data_path: str | VersionedValue[str] = "data" + branch: str | VersionedValue[str] = "master" + max_concurrent_downloads: int = 8 + request_timeout: int = 120 + github_token: str | None = None + skip_existing: bool = True + metadata_filename: str = "metadata.json" + + +def _http_headers(cfg: UncheatableEvalDownloadConfig) -> dict[str, str]: + headers = {"Accept": "application/vnd.github+json"} + token = cfg.github_token or os.environ.get("GITHUB_TOKEN") + if token: + headers["Authorization"] = f"Bearer {token}" + return headers + + +def _fetch_directory_listing(cfg: UncheatableEvalDownloadConfig) -> list[dict[str, Any]]: + """Return the list of files in the configured GitHub repository directory.""" + + headers = _http_headers(cfg) + base_url = f"https://api.github.com/repos/{cfg.repo_owner!s}/{cfg.repo_name!s}/contents/{cfg.data_path!s}" + params = {"ref": str(cfg.branch)} + response = requests.get(base_url, headers=headers, params=params, timeout=cfg.request_timeout) + response.raise_for_status() + payload = response.json() + if not isinstance(payload, list): + raise ValueError(f"Unexpected response from GitHub API: {payload!r}") + return payload + + +def _parse_available_dumps(entries: Iterable[dict[str, Any]]) -> list[UncheatableEvalDataset]: + """Parse GitHub directory entries into dataset metadata.""" + + datasets: list[UncheatableEvalDataset] = [] + for entry in entries: + name = entry.get("name") + if not isinstance(name, str): + continue + match = FILENAME_PATTERN.match(name) + if not match: + continue + benchmark = match.group("benchmark") + start = match.group("start") + end = match.group("end") + download_url = entry.get("download_url") + if not isinstance(download_url, str): + logger.debug("Skipping %s because it has no download_url", name) + continue + datasets.append( + UncheatableEvalDataset( + benchmark=benchmark, + start_date=start, + end_date=end, + name=name, + download_url=download_url, + sha=entry.get("sha"), + size=entry.get("size"), + ) + ) + return datasets + + +def _select_latest_dumps(datasets: Iterable[UncheatableEvalDataset]) -> list[UncheatableEvalDataset]: + """Select the latest dump for each benchmark based on the end date (and start date as tie breaker).""" + + latest: dict[str, UncheatableEvalDataset] = {} + for dataset in datasets: + existing = latest.get(dataset.benchmark) + if existing is None: + latest[dataset.benchmark] = dataset + continue + candidate_key = (dataset.end_date, dataset.start_date, dataset.name) + existing_key = (existing.end_date, 
existing.start_date, existing.name) + if candidate_key > existing_key: + latest[dataset.benchmark] = dataset + return sorted(latest.values(), key=lambda d: d.benchmark) + + +def _extract_id(raw: Any, dataset: UncheatableEvalDataset, index: int) -> str: + if isinstance(raw, dict): + for key in ID_FIELD_CANDIDATES: + value = raw.get(key) + if value: + return str(value) + metadata = raw.get("metadata") + if isinstance(metadata, dict): + for key in ID_FIELD_CANDIDATES: + value = metadata.get(key) + if value: + return str(value) + return f"{dataset.benchmark}_{dataset.date_range}_{index:06d}" + + +def _join_list_field(value: Any) -> str | None: + if isinstance(value, list): + text_items = [str(item) for item in value if item is not None] + if text_items: + return "\n".join(text_items) + return None + + +def _extract_text(raw: Any) -> str | None: + if raw is None: + return None + if isinstance(raw, str): + return raw + if isinstance(raw, dict): + for key in TEXT_FIELD_CANDIDATES: + value = raw.get(key) + if isinstance(value, str) and value.strip(): + return value + for key in TEXT_FIELD_CANDIDATES: + value = raw.get(key) + joined = _join_list_field(value) + if joined: + return joined + for key in LIST_FIELD_CANDIDATES: + joined = _join_list_field(raw.get(key)) + if joined: + return joined + title = raw.get("title") + body = raw.get("body") + if isinstance(title, str) and isinstance(body, str): + combined = f"{title.strip()}\n\n{body.strip()}" + if combined.strip(): + return combined + if isinstance(title, str) and title.strip(): + return title + return json.dumps(raw, ensure_ascii=False) + return str(raw) + + +def _normalize_record(raw: Any, dataset: UncheatableEvalDataset, index: int) -> dict[str, str]: + text = _extract_text(raw) + if text is None or not str(text).strip(): + raise ValueError(f"Record {index} in {dataset.name} does not contain text") + record_id = _extract_id(raw, dataset, index) + return {"id": record_id, "text": text, "source": dataset.source_label} + + +def _download_and_convert_single( + task: DownloadTask, +) -> dict[str, Any]: + session = requests.Session() + retries = Retry(total=5, backoff_factor=1.0, status_forcelist=[500, 502, 503, 504], allowed_methods=["GET"]) + adapter = HTTPAdapter(max_retries=retries) + session.mount("https://", adapter) + session.mount("http://", adapter) + + logger.info("Downloading %s from %s", task.dataset.name, task.download_url) + response = session.get(task.download_url, timeout=task.cfg.request_timeout, headers=_http_headers(task.cfg)) + response.raise_for_status() + + try: + payload = response.json() + except ValueError as exc: + raise ValueError(f"Failed to decode JSON payload for {task.dataset.name}") from exc + + if not isinstance(payload, list): + raise ValueError(f"Expected list in dataset {task.dataset.name}, found {type(payload).__name__}") + + fsspec_mkdirs(os.path.dirname(task.output_file_path), exist_ok=True) + + record_count = 0 + with atomic_rename(task.output_file_path) as temp_path: + with open_url(temp_path, "wt", encoding="utf-8", compression="gzip") as outfile: + for index, raw in enumerate(payload): + normalized = _normalize_record(raw, task.dataset, index) + json.dump(normalized, outfile, ensure_ascii=False) + outfile.write("\n") + record_count += 1 + + logger.info("Wrote %s records to %s", record_count, task.output_file_path) + return {"records": record_count, "output_file": task.output_file_path} + + +@dataclass +class DownloadTask: + download_url: str + output_file_path: str + dataset: UncheatableEvalDataset + cfg: 
UncheatableEvalDownloadConfig + + +def _generate_tasks( + datasets: Iterable[UncheatableEvalDataset], + cfg: UncheatableEvalDownloadConfig, +) -> tuple[list[DownloadTask], list[UncheatableEvalDataset]]: + tasks: list[DownloadTask] = [] + filtered: list[UncheatableEvalDataset] = [] + for dataset in datasets: + output_file = posixpath.join(str(cfg.output_path), dataset.output_filename()) + tasks.append(DownloadTask(dataset.download_url, output_file, dataset, cfg)) + filtered.append(dataset) + return tasks, filtered + + +def _write_metadata(cfg: UncheatableEvalDownloadConfig, records: list[dict[str, Any]]) -> None: + if not records: + return + metadata_path = posixpath.join(str(cfg.output_path), cfg.metadata_filename) + with open_url(metadata_path, "w", encoding="utf-8") as meta_file: + json.dump(records, meta_file, indent=2, ensure_ascii=False) + logger.info("Wrote metadata to %s", metadata_path) + + +def download_latest_uncheatable_eval(cfg: UncheatableEvalDownloadConfig) -> dict[str, Any]: + """Download and normalize the newest Uncheatable Eval dump for each benchmark.""" + + entries = _fetch_directory_listing(cfg) + datasets = _parse_available_dumps(entries) + latest_datasets = _select_latest_dumps(datasets) + + if not latest_datasets: + logger.warning("No datasets found that match the expected naming pattern") + return {"success": False, "reason": "no_datasets"} + + output_path = str(cfg.output_path) + fsspec_mkdirs(output_path, exist_ok=True) + + tasks, filtered_datasets = _generate_tasks(latest_datasets, cfg) + + if not tasks: + logger.info("No new datasets to process") + return {"success": True, "reason": "already_processed", "skipped": True} + + metadata_records: list[dict[str, Any]] = [] + + pipeline = ( + Dataset.from_list(tasks) + .map(lambda task: _download_and_convert_single(task)) + .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) + ) + ctx = ZephyrContext(name="download-uncheatable-eval") + output_paths = ctx.execute(pipeline) + + for dataset, metadata_file in zip(filtered_datasets, output_paths, strict=True): + with open_url(metadata_file, "r", encoding="utf-8") as meta_file: + result = json.load(meta_file) + + try: + metadata_records.append( + { + "benchmark": dataset.benchmark, + "start_date": dataset.start_date, + "end_date": dataset.end_date, + "source": dataset.source_label, + "output_file": posixpath.join(output_path, dataset.output_filename()), + "records": result.get("records"), + "sha": dataset.sha, + "size": dataset.size, + } + ) + except Exception: + logger.exception("Failed to process dataset %s", dataset.name) + raise + + _write_metadata(cfg, metadata_records) + return {"success": True, "processed": metadata_records} + + +def uncheatable_eval_step( + name: str = "raw/uncheatable-eval/latest", + *, + repo_owner: str = "ziqing-huang", + repo_name: str = "uncheatable_eval", + data_path: str = "data", + branch: str = "master", + max_concurrent_downloads: int = 8, + request_timeout: int = 120, + github_token: str | None = None, + skip_existing: bool = True, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads the latest Uncheatable Eval dumps.""" + + def _run(output_path: str) -> dict: + cfg = UncheatableEvalDownloadConfig( + output_path=output_path, + repo_owner=repo_owner, + repo_name=repo_name, + data_path=data_path, + branch=branch, + max_concurrent_downloads=max_concurrent_downloads, + 
request_timeout=request_timeout, + github_token=github_token, + skip_existing=skip_existing, + ) + return download_latest_uncheatable_eval(cfg) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={ + "repo_owner": repo_owner, + "repo_name": repo_name, + "data_path": data_path, + "branch": branch, + }, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) + + +def make_uncheatable_eval_step( + *, + name: str = "raw/uncheatable-eval/latest", + repo_owner: str = "ziqing-huang", + repo_name: str = "uncheatable_eval", + data_path: str = "data", + branch: str = "master", + max_concurrent_downloads: int = 8, + request_timeout: int = 120, + github_token: str | None = None, + skip_existing: bool = True, +) -> ExecutorStep: + """Create an ExecutorStep that downloads the latest Uncheatable Eval dumps. + + Backward-compat wrapper around uncheatable_eval_step(). + """ + return uncheatable_eval_step( + name=name, + repo_owner=repo_owner, + repo_name=repo_name, + data_path=data_path, + branch=branch, + max_concurrent_downloads=max_concurrent_downloads, + request_timeout=request_timeout, + github_token=github_token, + skip_existing=skip_existing, + ).as_executor_step() + + +__all__ = [ + "UncheatableEvalDataset", + "UncheatableEvalDownloadConfig", + "download_latest_uncheatable_eval", + "make_uncheatable_eval_step", + "uncheatable_eval_step", +] diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py new file mode 100644 index 0000000000..1dce125a0f --- /dev/null +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -0,0 +1,150 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +wikipedia/download.py + +Download script for the Wikipedia raw HTML data, provided by Wikimedia. + +Home Page: https://dumps.wikimedia.org/other/enterprise_html/runs/ + +Example Usage (production, large dataset): +ENWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/enwiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz +uv run zephyr --backend=ray --max-parallelism=10 \ + lib/marin/src/marin/download/wikipedia/download.py \ + --input_urls $ENWIKI \ + --revision 20250320 --output_path gs://path/to/output + +Example Usage (local testing, small dataset): +SIMPLEWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/simplewiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz +uv run zephyr --backend=threadpool --max-parallelism=4 --entry-point=download \ + lib/marin/src/marin/download/wikipedia/download.py \ + --input_urls "[$SIMPLEWIKI]" \ + --revision 20250320 --output_path /tmp/wikipedia_test + +Note: The enwiki-NS0 file (English Wikipedia, namespace 0 = articles) is approximately 130 GB compressed. + The simplewiki-NS0 file (Simple English Wikipedia) is much smaller at ~2 GB compressed. 
+""" + +import logging +import os +import tarfile +from collections.abc import Iterable +from dataclasses import dataclass + +import draccus +import requests +from iris.marin_fs import open_url +from marin.execution.step_spec import StepSpec +from marin.utils import fsspec_size +from tqdm_loggable.auto import tqdm +from zephyr import Dataset, ZephyrContext, atomic_rename, load_jsonl + +logger = logging.getLogger(__name__) + + +@dataclass +class WikipediaDownloadConfig: + input_urls: list[str] + revision: str + output_path: str + + +def download_tar(url: str, output_prefix) -> str: + shard_filename = url.split("/")[-1] + output_filename = os.path.join(output_prefix, shard_filename) + logger.info(f"Downloading URL: {url} to {output_filename}") + + try: + total_size = fsspec_size(url) + pbar = tqdm(total=total_size, desc="Downloading File", unit="B", unit_scale=True) + + with atomic_rename(output_filename) as tmp_filename, open_url(tmp_filename, "wb") as f: + r = requests.get(url, stream=True) + + for chunk in r.raw.stream(20 * 1024 * 1024, decode_content=False): + if chunk: + f.write(chunk) + f.flush() + + pbar.update(len(chunk)) + + return output_filename + except Exception as e: + logger.error(f"Error downloading URL: {url}") + raise e + + +def process_file(input_file: str, output_path: str) -> Iterable[str]: + logger.info(f"Processing file: {input_file}") + logger.info(f"Output path: {output_path}") + + try: + with open_url(input_file) as f: + with tarfile.open(fileobj=f, mode="r:gz") as tr: + for info in tr: + with tr.extractfile(info) as file: + file_content = file.read() + file_path = os.path.join(output_path, info.name + ".gz") + + # Each file is a .ndjson file, which contains about 18k-21k articles + # per file with size ranging from 200MB to 300MB + with ( + atomic_rename(file_path) as tmpfile_path, + open_url(tmpfile_path, "wb", compression="gzip") as output_f, + ): + output_f.write(file_content) + yield file_path + + except Exception as e: + logger.error(f"Error processing file: {input_file}") + raise e + + +@draccus.wrap() +def download(cfg: WikipediaDownloadConfig) -> None: + """Download and process Wikipedia data.""" + logger.info("Starting transfer of Wikipedia dump...") + output_base = os.path.join(cfg.output_path, cfg.revision) + + ctx = ZephyrContext(name="download-wikipedia") + download_metrics = ctx.execute( + Dataset.from_list(cfg.input_urls) + .map(lambda url: download_tar(url, output_base)) + .write_jsonl(f"{output_base}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), + ) + + # load all of the output filenames to process + downloads = ctx.execute(Dataset.from_list(download_metrics).flat_map(load_jsonl)) + + extracted = ctx.execute( + Dataset.from_list(downloads) + .flat_map(lambda file: process_file(file, output_base)) + .write_jsonl(f"{output_base}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), + ) + + logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) + + +def wikipedia_step( + name: str = "raw/wikipedia", + *, + input_urls: list[str], + revision: str, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads and processes Wikipedia HTML dumps.""" + + def _run(output_path: str) -> None: + download(WikipediaDownloadConfig(input_urls=input_urls, revision=revision, output_path=output_path)) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={"input_urls": input_urls, "revision": 
revision}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) From df63b150e47afcb414756c562437989f25dd1e5f Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:14:52 -0700 Subject: [PATCH 06/56] Convert marin.download.* to backward-compat re-export shims All download module implementations now live in marin.datakit.download.*. The old marin.download.* files are replaced with explicit re-exports from the canonical locations. Renamed configs (Ar5ivDownloadConfig, WikipediaDownloadConfig) are re-exported under their original names for backward compat. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/download/__init__.py | 7 +- .../src/marin/download/ar5iv/__init__.py | 2 + .../src/marin/download/ar5iv/download.py | 136 +----- .../src/marin/download/dclm_hq/__init__.py | 2 + .../download/dclm_hq/download_dclm_hq_html.py | 214 +--------- .../src/marin/download/filesystem/__init__.py | 2 + .../src/marin/download/filesystem/transfer.py | 68 +-- .../marin/download/huggingface/__init__.py | 2 + .../marin/download/huggingface/download_hf.py | 353 +--------------- .../huggingface/stream_remove_columns.py | 104 +---- .../download/huggingface/upload_gcs_to_hf.py | 362 +--------------- .../marin/download/nemotron_cc/__init__.py | 2 + .../nemotron_cc/download_nemotron_cc.py | 120 +----- .../download/uncheatable_eval/__init__.py | 2 + .../download/uncheatable_eval/download.py | 396 +----------------- .../src/marin/download/wikipedia/__init__.py | 2 + .../src/marin/download/wikipedia/download.py | 126 +----- 17 files changed, 64 insertions(+), 1836 deletions(-) create mode 100644 lib/marin/src/marin/download/ar5iv/__init__.py create mode 100644 lib/marin/src/marin/download/dclm_hq/__init__.py create mode 100644 lib/marin/src/marin/download/filesystem/__init__.py create mode 100644 lib/marin/src/marin/download/huggingface/__init__.py create mode 100644 lib/marin/src/marin/download/nemotron_cc/__init__.py create mode 100644 lib/marin/src/marin/download/uncheatable_eval/__init__.py create mode 100644 lib/marin/src/marin/download/wikipedia/__init__.py diff --git a/lib/marin/src/marin/download/__init__.py b/lib/marin/src/marin/download/__init__.py index b5a56a002d..26067cbf97 100644 --- a/lib/marin/src/marin/download/__init__.py +++ b/lib/marin/src/marin/download/__init__.py @@ -1,6 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. 
Canonical location: marin.datakit.download -from .huggingface.download_hf import DownloadConfig as HfDownloadConfig -from .huggingface.download_hf import download_hf -from .huggingface.download_hf import download_hf as download_hf_ungated +from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig +from marin.datakit.download.huggingface import download_hf +from marin.datakit.download.huggingface import download_hf as download_hf_ungated diff --git a/lib/marin/src/marin/download/ar5iv/__init__.py b/lib/marin/src/marin/download/ar5iv/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/ar5iv/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/ar5iv/download.py b/lib/marin/src/marin/download/ar5iv/download.py index 9483370c71..1a64dbf93e 100644 --- a/lib/marin/src/marin/download/ar5iv/download.py +++ b/lib/marin/src/marin/download/ar5iv/download.py @@ -1,135 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.ar5iv -""" -Download and process Ar5iv dataset from a zip file. - -Example Usage: -uv run zephyr --backend=ray --max-parallelism=1000 --memory=10GB \ - lib/marin/src/marin/download/ar5iv/download.py \ - --input_path gs://bucket/ar5iv.zip \ - --output_path gs://bucket/output -""" - -import json -import logging -import zipfile -from collections import defaultdict -from dataclasses import dataclass - -import draccus -from iris.marin_fs import open_url -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename -from iris.logging import configure_logging - -logger = logging.getLogger(__name__) - - -@dataclass -class DownloadConfig: - input_path: str - output_path: str - max_files: int | None = None # Maximum number of shards to process - - -def process_shard(shard_task: dict) -> dict: - """ - Process a single shard by extracting its files from the zip in GCS and uploading the merged JSONL. - - Args: - shard_task: Dict with keys 'input_path', 'output_path', 'shard_id', 'file_list' - """ - input_path = shard_task["input_path"] - output_path = shard_task["output_path"] - shard_id = shard_task["shard_id"] - file_list = shard_task["file_list"] - gcs_path = f"{output_path}/{shard_id}.jsonl.gz" - - with open_url(str(input_path), "rb") as f: - with zipfile.ZipFile(f) as zf: - with atomic_rename(gcs_path) as temp_path, open_url(temp_path, "wt", compression="gzip") as out_f: - for filename in file_list: - with zf.open(filename, "r") as file_handle: - content = file_handle.read() - record = { - "filename": filename, - "format": "html", - "content": content.decode("utf-8", errors="replace"), - } - print(json.dumps(record), file=out_f) - - logger.info(f"Shard {shard_id} with {len(file_list)} files uploaded to {gcs_path}") - return {"shard_id": shard_id, "num_files": len(file_list), "output_path": gcs_path} - - -def download(cfg: DownloadConfig) -> None: - """ - Download and process Ar5iv dataset from a zip file in GCS. - - This function can be called by the executor framework or used standalone. 
- """ - logger.info("Starting transfer of Ar5iv dataset...") - logger.info(f"Source: {cfg.input_path}") - - # Use fsspec+zipfile to list all files - with open_url(str(cfg.input_path), "rb") as f: - with zipfile.ZipFile(f) as zf: - all_files = zf.infolist() - - # Group by shard directory - # We assume structure: something like: shard_id/.../file - # shard_id is derived from the second last component if files are nested. - # Adjust as needed if directory structure differs. - shard_dict = defaultdict(list) - for info in all_files: - if info.is_dir(): - continue - # E.g. path might look like: "003/something.html" - # Extract shard_id from the directory: - # Split by "/" and take the first part if we assume structure {shard_id}/file - parts = info.filename.strip("/").split("/") - if len(parts) < 2: - # File at root level - decide how to handle this case. - # If no directory structure is given, skip or treat differently. - continue - shard_id = parts[-2] # get the second-last directory as shard_id - shard_dict[shard_id].append(info.filename) - - # Apply max_files limit if provided - shard_ids = list(shard_dict.keys()) - if cfg.max_files is not None: - shard_ids = shard_ids[: cfg.max_files] - - logger.info(f"Found {len(shard_ids)} shards to process.") - - # Build task list for each shard - shard_tasks = [] - for shard_id in shard_ids: - shard_tasks.append( - { - "input_path": cfg.input_path, - "output_path": cfg.output_path, - "shard_id": shard_id, - "file_list": shard_dict[shard_id], - } - ) - - # Execute pipeline with zephyr - pipeline = ( - Dataset.from_list(shard_tasks) - .map(process_shard) - .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) - ) - ctx = ZephyrContext(name="download-ar5iv") - ctx.execute(pipeline) - - logger.info("Transfer completed successfully!") - - -@draccus.wrap() -def main(cfg: DownloadConfig) -> None: - """CLI entrypoint for downloading and processing Ar5iv dataset.""" - - configure_logging(level=logging.INFO) - download(cfg) +from marin.datakit.download.ar5iv import Ar5ivDownloadConfig as DownloadConfig # noqa: F401 - used by tests +from marin.datakit.download.ar5iv import download as download +from marin.datakit.download.ar5iv import process_shard as process_shard diff --git a/lib/marin/src/marin/download/dclm_hq/__init__.py b/lib/marin/src/marin/download/dclm_hq/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/dclm_hq/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py b/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py index 9250ede43d..a49caab9d7 100644 --- a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py +++ b/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py @@ -1,208 +1,10 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 - -""" -Download DCLM HQ HTML data by fetching HTML content from Common Crawl. - -Processes DCLM HQ JSONL files and enriches them with HTML content fetched from Common Crawl -via a custom index server. Uses zephyr for parallel processing with flattened parallelism. 
- -Example Usage: -uv run zephyr --backend=ray --max-parallelism=800 --memory=2GB \ - lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py \ - --input_path gs://marin-us-central2/raw/dclm-baseline-1.0-parquet/global/ \ - --output_path gs://marin-data/processed/dclm-hq-html/ -""" - -import io -import json -import logging -import os -import re -from dataclasses import dataclass - -import requests -from iris.marin_fs import open_url -import warcio -from marin.utils import fsspec_glob -from tqdm import tqdm -from zephyr import Dataset, ZephyrContext -from zephyr.writers import ensure_parent_dir - -CC_IDX_HOST_URL = "http://34.72.201.218:8080" -logger = logging.getLogger(__name__) - - -@dataclass -class DCLMHQDownloadConfig: - input_path: str - output_path: str - - -@dataclass -class FileTask: - """Represents a single file processing task.""" - - input_file_path: str - output_file_path: str - - -def fetch_warc_from_cc(s3_warc_path: str, length: int, offset: int) -> str: - """ - Fetch a WARC record from Common Crawl S3 bucket using byte range requests we get - from the CC index via `find_html_in_cc`. - Args: - s3_warc_path: Path to WARC file in S3 bucket - length: Length of the record in bytes - offset: Byte offset of the record in the WARC file - Returns: - The WARC record content as a string - """ - # Convert string values to integers - offset = int(offset) - length = int(length) - - # Make range request to CommonCrawl - response = requests.get( - f"https://data.commoncrawl.org/{s3_warc_path}", headers={"Range": f"bytes={offset}-{offset + length - 1}"} - ) - response.raise_for_status() - - # Parse WARC record and extract HTML content - with io.BytesIO(response.content) as stream: - for record in warcio.ArchiveIterator(stream): - content = record.content_stream().read() - return content.decode(errors="ignore") - - raise ValueError(f"No WARC records found in response from {s3_warc_path}") - - -def find_html_in_cc(split_id: str, target_uri: str) -> str | None: - """ - We host our own index of the Common Crawl over GCP which we use in this function. - For each call we receive a list of chunks that contain the HTML content for the given target URI. - We then fetch each chunk and concatenate them together to form the complete HTML content. - Args: - split_id: The split ID of the Common Crawl - target_uri: The target URI to find the HTML content for - Returns: - The HTML content as a string - """ - resp = requests.get(f"{CC_IDX_HOST_URL}/{split_id}-index?url={target_uri}&output=json") - - resp.raise_for_status() - - chunks = [json.loads(chunk) for chunk in resp.text.split("\n") if chunk] - sorted_chunks = sorted(chunks, key=lambda x: x["offset"]) - - html_content = "" - - for chunk in sorted_chunks: - warc_path = chunk["filename"] - length = chunk["length"] - offset = chunk["offset"] - - warc_record = fetch_warc_from_cc(warc_path, length, offset) - - html_content += warc_record - - return html_content - - -def process_file(task: FileTask) -> None: - """Process a single DCLM file, fetching HTML from Common Crawl. 
- - Args: - task: FileTask containing input and output file paths - """ - logger.info(f"Starting processing of file {task.input_file_path}") - logger.info(f"Source: {task.input_file_path}") - logger.info(f"Destination: {task.output_file_path}") - try: - ensure_parent_dir(task.output_file_path) - with ( - open_url(task.input_file_path, compression="zstd") as source, - open_url(task.output_file_path, "wt", compression="gzip") as output, - ): - text_wrapper = io.TextIOWrapper(source, encoding="utf-8") - - for line in tqdm(text_wrapper, desc="Processing lines"): - row = json.loads(line.strip()) - - # We need to extract the split from where the record was for querying the index - # The only place we have this information is in the warcinfo key in DCLM HQ - # The format is: - # warc-type: WARC/1.1 - # ... - # isPartOf: CC-MAIN-2024-01 - # This however is a string and not a key-value pair, so we need to extract - # the split from it via regex pattern `isPartOf:\s*(CC-MAIN-\d{4}-\d{2})`. - # This pattern groups the value of the key `isPartOf` that is of the form - # `CC-MAIN-xxxx-xx` where `xxxx` is a year and `xx` is a month. - match = re.search(r"isPartOf:\s*(CC-MAIN-\d{4}-\d{2})", row["metadata"]["warcinfo"]) - if match is None: - logger.error(f"No split found for record ID: {row['metadata']['WARC-Record-ID']}") - continue - - is_part_of = match.group(1) - - try: - html_string = find_html_in_cc(is_part_of, row["metadata"]["WARC-Target-URI"]) - - if html_string is None: - logger.error(f"No HTML found for record ID: {row['metadata']['WARC-Record-ID']}") - continue - - if "text" in row: - row.pop("text") - - row["html"] = html_string - - print(json.dumps(row), file=output) - except Exception as e: - logger.exception(f"Error processing line: {e}") - continue - - logger.info("\nProcessing completed successfully!") - logger.info(f"File available at: {task.output_file_path}") - - except Exception as e: - logger.error(f"Error during processing: {e}") - raise - - -def extract_dclm_hq_dump(cfg: DCLMHQDownloadConfig) -> None: - """Process the DCLM HQ dump in the input path and save the results to the output path. - - Flattens the nested directory structure (shards → files) into a single list of files - and processes them in parallel using zephyr. - """ - logger.info(f"Starting processing of DCLM HQ dump in {cfg.input_path}") - - # Flatten nested structure: discover all files upfront - all_files = [] - paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(cfg.input_path, "*"))] - - logger.info(f"Found {len(paths)} shards to process") - - for path in paths: - input_path = os.path.join(cfg.input_path, path) - shard_paths = fsspec_glob(os.path.join(input_path, "*.json.zst")) - - for shard_path in shard_paths: - input_file_path = shard_path - output_file_path = os.path.join(cfg.output_path, path, os.path.basename(shard_path)).replace( - ".json.zst", ".jsonl.gz" - ) - - all_files.append(FileTask(input_file_path=input_file_path, output_file_path=output_file_path)) - - logger.info(f"Found {len(all_files)} files to process") - - # Single-level parallelism over all files - pipeline = Dataset.from_list(all_files).map(process_file) - - ctx = ZephyrContext(name="download-dclm-html") - ctx.execute(pipeline) - - logger.info("Processing completed successfully!") +# Backward-compat shim. 
Canonical location: marin.datakit.download.dclm_hq + +from marin.datakit.download.dclm_hq import DCLMHQDownloadConfig as DCLMHQDownloadConfig +from marin.datakit.download.dclm_hq import FileTask as FileTask +from marin.datakit.download.dclm_hq import extract_dclm_hq_dump as extract_dclm_hq_dump +from marin.datakit.download.dclm_hq import fetch_warc_from_cc as fetch_warc_from_cc +from marin.datakit.download.dclm_hq import find_html_in_cc as find_html_in_cc +from marin.datakit.download.dclm_hq import process_file as process_file diff --git a/lib/marin/src/marin/download/filesystem/__init__.py b/lib/marin/src/marin/download/filesystem/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/filesystem/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/filesystem/transfer.py b/lib/marin/src/marin/download/filesystem/transfer.py index e28a6667d8..5456bf8cc5 100644 --- a/lib/marin/src/marin/download/filesystem/transfer.py +++ b/lib/marin/src/marin/download/filesystem/transfer.py @@ -1,68 +1,6 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.filesystem -import os -import random -import time -from dataclasses import dataclass - -from iris.marin_fs import url_to_fs -from zephyr import Dataset, ZephyrContext - -from marin.utils import fsspec_exists, fsspec_glob - - -@dataclass -class TransferConfig: - input_path: str - output_path: str - - # Selectively choose the number of random files to transfer. None means all files - num_random_files: int | None = None - filetype: str = "jsonl.zst" - - -def transfer_files(config: TransferConfig) -> None: - """Transfers files from the input path to the output path. - - When num_random_files is None, copies the entire directory recursively. - When num_random_files is specified, randomly samples that many files and - copies them in parallel using zephyr. 
- """ - if config.input_path.endswith("/"): - input_path = config.input_path[:-1] - else: - input_path = config.input_path - - print(f"Downloading {input_path} from GCS.") - start_time: float = time.time() - fs, _ = url_to_fs(input_path) - if not fs.exists(input_path): - raise FileNotFoundError(f"{input_path} does not exist.") - - # Glob all matching files - filenames = fsspec_glob(os.path.join(input_path, f"**/*.{config.filetype}")) - - # Select files: either random sample or all files - if config.num_random_files is None: - selected_files = filenames - else: - random.seed(42) - random.shuffle(filenames) - selected_files = filenames[: config.num_random_files] - - def copy_file(filename: str) -> None: - """Copy a single file if it doesn't already exist at destination.""" - output_filename = os.path.join(config.output_path, os.path.basename(filename)) - if not fsspec_exists(output_filename): - # Ensure output directory exists - fs.makedirs(config.output_path, exist_ok=True) - fs.copy(filename, output_filename) - - # Always use parallel copying via zephyr - pipeline = Dataset.from_list(selected_files).map(copy_file) - ctx = ZephyrContext(name="fs-transfer") - ctx.execute(pipeline) - - elapsed_time_seconds: float = time.time() - start_time - print(f"Downloaded {input_path} to {config.output_path} ({elapsed_time_seconds}s).") +from marin.datakit.download.filesystem import TransferConfig as TransferConfig +from marin.datakit.download.filesystem import transfer_files as transfer_files diff --git a/lib/marin/src/marin/download/huggingface/__init__.py b/lib/marin/src/marin/download/huggingface/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/huggingface/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/huggingface/download_hf.py b/lib/marin/src/marin/download/huggingface/download_hf.py index 089ef63e0c..9912a5d2c0 100644 --- a/lib/marin/src/marin/download/huggingface/download_hf.py +++ b/lib/marin/src/marin/download/huggingface/download_hf.py @@ -1,353 +1,12 @@ -#!/usr/bin/env python3 # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.huggingface -""" -A script to download a HuggingFace dataset and upload it to a specified fsspec path, -using HfFileSystem for direct streaming of data transfer. 
-""" - -import logging -import os -import random -import socket -import time -from dataclasses import dataclass, field - -import draccus -import huggingface_hub -from huggingface_hub import HfFileSystem -from iris.marin_fs import open_url, url_to_fs -from huggingface_hub.errors import HfHubHTTPError -from packaging.version import Version -from marin.execution.executor import THIS_OUTPUT_PATH -from marin.utilities.validation_utils import write_provenance_json -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename -from iris.logging import configure_logging - -logger = logging.getLogger(__name__) - -HF_PROTOCOL_PREFIX = "hf://" -HF_BUCKET_PATH_PREFIX = "buckets/" - - -@dataclass(frozen=True) -class DownloadConfig: - # fmt: off - - # HuggingFace Dataset Parameters - hf_dataset_id: str # HF Dataset to Download (as `$ORG/$DATASET` on HF Hub) - - revision: str # (Short) Commit Hash (from HF Dataset Repo; 7 characters) - hf_urls_glob: list[str] = field(default_factory=list) - # List of Glob Patterns to Match Files in HF Dataset, If empty we get all the files in a hf repo - - gcs_output_path: str = THIS_OUTPUT_PATH - """ - Path to store raw data in persistent storage (e.g. gs://$BUCKET/...). - This works with any fsspec-compatible path, but for backwards compatibility, we call it gcs_output_path. - """ - - append_sha_to_path: bool = False - """If true, write outputs under ``gcs_output_path/`` instead of directly under ``gcs_output_path``.""" - - # Job Control Parameters, used only for non-gated dataset transfers done via STS - wait_for_completion: bool = True # if True, will block until job completes - - # fmt: on - hf_repo_type_prefix: str = ( - "datasets" # The repo_type_prefix is datasets/ for datasets, - # spaces/ for spaces, and models do not need a prefix in the URL. - ) - - zephyr_max_parallelism: int = 8 - """Maximum parallelism of the Zephyr download job""" - - read_timeout_seconds: float = 120.0 - """Socket read timeout while streaming each HF file. Timeout failures trigger retries.""" - - progress_log_interval_seconds: float = 60.0 - """Log a heartbeat for each in-flight shard every N seconds while bytes are flowing.""" - - read_chunk_size_mib: int = 8 - """Chunk size for each streaming read from HF.""" - - -def _strip_hf_protocol(path: str) -> str: - return path.removeprefix(HF_PROTOCOL_PREFIX).lstrip("/") - - -def _resolve_hf_source_path(cfg: DownloadConfig) -> str: - source_path = ( - os.path.join(cfg.hf_repo_type_prefix, cfg.hf_dataset_id) if cfg.hf_repo_type_prefix else cfg.hf_dataset_id - ) - return _strip_hf_protocol(source_path) - - -def _assert_bucket_support_available(source_path: str) -> None: - if not source_path.startswith(HF_BUCKET_PATH_PREFIX): - return - - if Version(huggingface_hub.__version__) < Version("1.6.0"): - raise RuntimeError( - f"Bucket paths require huggingface_hub>=1.6.0, found {huggingface_hub.__version__}. " - "Upgrade the runtime environment to a buckets-capable huggingface_hub version." 
- ) - - -def _relative_path_in_source(file_path: str, source_path: str) -> str: - normalized_file = _strip_hf_protocol(file_path) - normalized_source = _strip_hf_protocol(source_path).rstrip("/") - - source_prefix = f"{normalized_source}/" - if normalized_file.startswith(source_prefix): - return normalized_file.removeprefix(source_prefix) - - source_parts = [segment for segment in normalized_source.split("/") if segment] - file_parts = [segment for segment in normalized_file.split("/") if segment] - - if len(file_parts) >= len(source_parts): - matches_source = True - for source_segment, file_segment in zip(source_parts, file_parts, strict=False): - if source_segment == file_segment: - continue - if file_segment.split("@", 1)[0] == source_segment: - continue - matches_source = False - break - - if matches_source: - return "/".join(file_parts[len(source_parts) :]) - - # Backwards-compatible fallback for historical dataset path layout. - return normalized_file.split("/", 3)[-1] - - -def ensure_fsspec_path_writable(output_path: str) -> None: - """Check if the fsspec path is writable by trying to create and delete a temporary file.""" - fs, _ = url_to_fs(output_path) - try: - fs.mkdirs(output_path, exist_ok=True) - test_path = os.path.join(output_path, "test_write_access") - with fs.open(test_path, "w") as f: - f.write("test") - fs.rm(test_path) - except Exception as e: - raise ValueError(f"No write access to fsspec path: {output_path} ({e})") from e - - -def stream_file_to_fsspec( - gcs_output_path: str, - file_path: str, - fsspec_file_path: str, - expected_size: int | None = None, - read_timeout_seconds: float = 120.0, - progress_log_interval_seconds: float = 60.0, - read_chunk_size_mib: int = 8, -): - """Stream a file from HfFileSystem to another fsspec path using atomic write. - - Uses atomic_rename to write to a temp file first, then rename on success. - This enables recovery across individual files if the job is interrupted. - - Args: - gcs_output_path: Base output path for the download. - file_path: Source file path on HuggingFace. - fsspec_file_path: Target file path on the destination filesystem. - expected_size: Expected file size in bytes for validation. If provided, - the download will fail if the downloaded size doesn't match. 
- """ - hf_fs = HfFileSystem(token=os.environ.get("HF_TOKEN", False)) - target_fs, _ = url_to_fs(gcs_output_path) - chunk_size = max(1, int(read_chunk_size_mib)) * 1024 * 1024 - max_retries = 20 - # 15 minutes max sleep - max_sleep = 15 * 60 - # Minimum base wait time to avoid too-fast retries - min_base_wait = 5 - - # Retry when there is an error, such as hf rate limit - last_exception = None - for attempt in range(max_retries): - try: - target_fs.mkdirs(os.path.dirname(fsspec_file_path), exist_ok=True) - bytes_written = 0 - with atomic_rename(fsspec_file_path) as temp_path: - previous_socket_timeout = socket.getdefaulttimeout() - socket.setdefaulttimeout(read_timeout_seconds) - try: - with ( - hf_fs.open(file_path, "rb", block_size=chunk_size) as src_file, - open_url(temp_path, "wb") as dest_file, - ): - start_time = time.monotonic() - next_progress_log = start_time + progress_log_interval_seconds - while True: - try: - chunk = src_file.read(chunk_size) - except TimeoutError as timeout_error: - raise TimeoutError( - f"Timed out reading from {file_path} after " - f"{read_timeout_seconds:.1f}s with {bytes_written} bytes written" - ) from timeout_error - if not chunk: - break - dest_file.write(chunk) - bytes_written += len(chunk) - now = time.monotonic() - if progress_log_interval_seconds > 0 and now >= next_progress_log: - elapsed = max(now - start_time, 1e-9) - speed_mib_s = (bytes_written / (1024**2)) / elapsed - logger.info( - f"Streaming {file_path}: {bytes_written / (1024**2):.1f} MiB written " - f"in {elapsed:.1f}s ({speed_mib_s:.2f} MiB/s)" - ) - next_progress_log = now + progress_log_interval_seconds - finally: - socket.setdefaulttimeout(previous_socket_timeout) - - # Validate file size BEFORE atomic_rename commits the file - if expected_size is not None and bytes_written != expected_size: - raise ValueError( - f"Size mismatch for {file_path}: expected {expected_size} bytes, got {bytes_written} bytes" - ) - - logger.info(f"Streamed {file_path} successfully to {fsspec_file_path} ({bytes_written} bytes)") - return {"file_path": file_path, "status": "success", "size": bytes_written} - except Exception as e: - last_exception = e - # Base wait: min 5s, then exponential: 5, 10, 20, 40, 80, 160, 320, 600 (capped) - wait_base = max(min_base_wait, min_base_wait * (2**attempt)) - - error_type = type(e).__name__ - error_msg = str(e) - status_code = -1 - - if isinstance(e, HfHubHTTPError): - status_code = e.response.status_code - TOO_MANY_REQUESTS = 429 - if status_code == TOO_MANY_REQUESTS: - # NOTE: RateLimit "api\|pages\|resolvers";r=[remaining];t=[seconds remaining until reset] - try: - rate_limit_wait = int(e.response.headers["RateLimit"].split(";")[-1].split("=")[-1]) - wait_base = max(wait_base, rate_limit_wait + 10) # Add buffer to rate limit wait - except Exception: - logger.warning("Failed to parse rate limit header, using default wait period") - - logger.warning( - f"Attempt {attempt + 1}/{max_retries} failed for {file_path}: " - f"{error_type} (status={status_code}): {error_msg}" - ) - - jitter = random.uniform(0, min(wait_base * 0.25, 30)) # Up to 25% jitter, max 30s - wait_time = min(wait_base + jitter, max_sleep) - - logger.info(f"Retrying {file_path} in {wait_time:.1f}s...") - time.sleep(wait_time) - - raise RuntimeError( - f"Failed to download {file_path} after {max_retries} attempts. 
" - f"Last error: {type(last_exception).__name__}: {last_exception}" - ) - - -def download_hf(cfg: DownloadConfig) -> None: - - configure_logging(level=logging.INFO) - - # Set cfg.append_sha_to_path=True to mimic the older behavior of writing to gcs_output_path/. - # Some historical datasets were written that way, so this flag keeps backwards compatibility when needed. - - # Ensure the output path is writable - try: - output_path = os.path.join(cfg.gcs_output_path, cfg.revision) if cfg.append_sha_to_path else cfg.gcs_output_path - ensure_fsspec_path_writable(output_path) - except ValueError as e: - logger.exception(f"Output path validation failed: {e}") - raise e - - # Initialize Hugging Face filesystem - logger.info("Identifying files to download from HuggingFace...") - hf_fs = HfFileSystem(token=os.environ.get("HF_TOKEN", False)) - hf_source_path = _resolve_hf_source_path(cfg) - _assert_bucket_support_available(hf_source_path) - - if not cfg.hf_urls_glob: - # We get all the files using find - files = hf_fs.find(hf_source_path, revision=cfg.revision) - else: - # Get list of files directly from HfFileSystem matching the pattern - files = [] - for hf_url_glob in cfg.hf_urls_glob: - pattern = os.path.join(hf_source_path, hf_url_glob) - files += hf_fs.glob(pattern, revision=cfg.revision) - - if not files: - raise ValueError(f"No files found for dataset `{cfg.hf_dataset_id}. Used glob patterns: {cfg.hf_urls_glob}") - - # Get file sizes for validation - logger.info("Getting file sizes for validation...") - file_sizes: dict[str, int | None] = {} - for file in files: - try: - info = hf_fs.info(file, revision=cfg.revision) - file_sizes[file] = info.get("size") or None - except Exception as e: - logger.warning(f"Could not get size for {file}: {e}") - file_sizes[file] = None # Will skip validation for this file - - download_tasks = [] - - for file in files: - try: - relative_file_path = _relative_path_in_source(file, hf_source_path) - if relative_file_path.startswith(".."): - raise ValueError(f"Computed path escapes source root: source={hf_source_path}, file={file}") - fsspec_file_path = os.path.join(output_path, relative_file_path) - expected_size = file_sizes.get(file) - download_tasks.append( - ( - output_path, - file, - fsspec_file_path, - expected_size, - cfg.read_timeout_seconds, - cfg.progress_log_interval_seconds, - cfg.read_chunk_size_mib, - ) - ) - except Exception as e: - logging.exception(f"Error preparing task for {file}: {e}") - - total_files = len(download_tasks) - total_size_gb = sum(s for s in file_sizes.values() if s is not None) / (1024**3) - logger.info(f"Total number of files to process: {total_files} ({total_size_gb:.2f} GB)") - - pipeline = ( - Dataset.from_list(download_tasks) - .map(lambda task: stream_file_to_fsspec(*task)) - .write_jsonl( - f"{cfg.gcs_output_path}/.metrics/success-part-{{shard:05d}}-of-{{total:05d}}.jsonl", skip_existing=True - ) - ) - ctx = ZephyrContext(name="download-hf", max_workers=cfg.zephyr_max_parallelism) - ctx.execute(pipeline) - - # Write Provenance JSON - write_provenance_json( - output_path, - metadata={"dataset": cfg.hf_dataset_id, "version": cfg.revision, "links": files}, - ) - - logger.info(f"Streamed all files and wrote provenance JSON; check {output_path}.") - - -@draccus.wrap() -def main(cfg: DownloadConfig) -> None: - """Download HuggingFace dataset.""" - download_hf(cfg) - +from marin.datakit.download.huggingface import DownloadConfig as DownloadConfig +from marin.datakit.download.huggingface import download_hf as download_hf +from 
marin.datakit.download.huggingface import ensure_fsspec_path_writable as ensure_fsspec_path_writable +from marin.datakit.download.huggingface import main as main +from marin.datakit.download.huggingface import stream_file_to_fsspec as stream_file_to_fsspec if __name__ == "__main__": main() diff --git a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py b/lib/marin/src/marin/download/huggingface/stream_remove_columns.py index b16e3a1f1b..6d5d39f492 100644 --- a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py +++ b/lib/marin/src/marin/download/huggingface/stream_remove_columns.py @@ -1,101 +1,9 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.stream_remove_columns -"""Remove unnecessary columns while streaming data from huggingface.""" - -import logging -import os -from dataclasses import dataclass - -import pandas as pd -import pyarrow.parquet as pq -from huggingface_hub import HfFileSystem -from tqdm import tqdm -from zephyr import Dataset, ZephyrContext - -hf_fs = HfFileSystem() -logger = logging.getLogger(__name__) - - -def prune_stream_and_save(input_file: str, output_file: str, keep_columns: list[str]): - """ - Prunes and saves a parquet file by removing un-specified columns. - - Reads the input parquet file in batches, removes columns not in keep_columns, - and writes the result to output_file. Processing in batches avoids memory issues. - - Args: - input_file (str): Path to input parquet file on HuggingFace - output_file (str): Path where pruned parquet file will be saved - keep_columns (list[str]): List of column names to retain - """ - parquet_file = pq.ParquetFile(hf_fs.open(input_file)) - - full_df_list = [] - for batch in tqdm(parquet_file.iter_batches(batch_size=10000), desc=f"Processing {input_file}"): - df = batch.to_pandas() - - drop_columns = [col for col in df.columns if col not in keep_columns] - df = df.drop(columns=drop_columns) - - full_df_list.append(df) - - full_df = pd.concat(full_df_list) - logger.info(f"Saving pruned dataset of shape {full_df.shape} to {output_file}") - full_df.to_parquet(output_file, index=False) - - -def get_file_tasks(hf_path: str, output_path: str, keep_columns: list[str]): - """ - Generate file processing tasks for a HuggingFace subset. 
- - Args: - hf_path (str): The HuggingFace dataset path to load - output_path (str): The output path to save the pruned dataset - keep_columns (list[str]): The columns to keep in the pruned dataset - - Yields: - Dict with input_file, output_file, and keep_columns for each parquet file - """ - logger.info(f"Loading dataset from {hf_path}") - parquet_list = hf_fs.glob(f"{hf_path}/*.parquet") - - for file in parquet_list: - output_file = os.path.join(output_path, os.path.basename(file)) - yield {"input_file": file, "output_file": output_file, "keep_columns": keep_columns} - - -@dataclass -class DatasetConfig: - hf_repo_id: str - hf_revision: str - hf_paths: list[str] - output_path: str - keep_columns: list[str] - - -def prune_hf_dataset(cfg: DatasetConfig): - logger.info(f"Starting dataset pruning for {cfg.hf_paths}") - - # Build list of subset paths to process - subset_tasks = [] - for path in cfg.hf_paths: - # HF Path form: hf://[][@]/ - hf_path = f"hf://datasets/{cfg.hf_repo_id}@{cfg.hf_revision}/{path}" - logger.info(f"Processing subset {hf_path}") - output_path = os.path.join(cfg.output_path, path) - subset_tasks.append({"hf_path": hf_path, "output_path": output_path}) - - # Build pipeline with nested parallelism: - # - Outer level: process subsets (MAX_CONCURRENT_WORKERS=1) - # - Inner level: process files within each subset - pipeline = ( - Dataset.from_list(subset_tasks) - .flat_map(lambda task: get_file_tasks(task["hf_path"], task["output_path"], cfg.keep_columns)) - .map(lambda task: prune_stream_and_save(task["input_file"], task["output_file"], cfg.keep_columns)) - ) - - logger.info("Executing pipeline") - ctx = ZephyrContext(name="hf-remove-columns") - ctx.execute(pipeline) - logger.info("Successfully processed all subsets") +from marin.datakit.download.stream_remove_columns import DatasetConfig as DatasetConfig +from marin.datakit.download.stream_remove_columns import get_file_tasks as get_file_tasks +from marin.datakit.download.stream_remove_columns import hf_fs as hf_fs +from marin.datakit.download.stream_remove_columns import prune_hf_dataset as prune_hf_dataset +from marin.datakit.download.stream_remove_columns import prune_stream_and_save as prune_stream_and_save diff --git a/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py b/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py index 1aa580c618..43c368f5b9 100644 --- a/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py +++ b/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py @@ -1,364 +1,10 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.upload_gcs_to_hf -""" -Upload GCS to Hugging Face (HF) Script - -This script transfers model checkpoints or other content from Google Cloud Storage (GCS) -to Hugging Face repositories. 
It handles: -- Finding checkpoint directories in GCS buckets -- Downloading the content locally (to a temporary directory) -- Uploading to a specified Hugging Face repository with appropriate versioning -- Supporting dry-run mode to preview what would be uploaded - -Usage as a script: - python upload_gcs_to_hf.py --repo-id="organization/model-name" [--dry-run] [--directory="gs://bucket/path"] - -Usage as an ExecutorStep: - upload_step = ExecutorStep( - name="upload_model_to_hf", - fn=upload_gcs_to_hf, - config=UploadConfig( - hf_repo_id="organization/model-name", - gcs_directories=["gs://bucket/path/to/model"], - dry_run=False - ) - ) -""" - -import argparse -import logging -import os -import re -import subprocess -import tempfile -from dataclasses import dataclass, field - -from google.cloud import storage -from google.cloud.storage import transfer_manager -from huggingface_hub import HfApi, create_repo -from iris.logging import configure_logging - -# Set up logging -logger = logging.getLogger(__name__) - - -@dataclass -class UploadConfig: - """Configuration for uploading from GCS to Hugging Face.""" - - hf_repo_id: str - gcs_directories: list[str] = field(default_factory=list) - dry_run: bool = False - wait_for_completion: bool = True # Added for compatibility with other configs - - -# Default GCS directories to check if none specified -DEFAULT_GCS_DIRS = [ - "gs://marin-eu-west4/checkpoints/llama-8b-tootsie-0.001-19ad63/hf/", - "gs://marin-us-central2/checkpoints/llama-8b-tootsie-phase2/hf/", - "gs://marin-us-central2/checkpoints/llama-8b-tootsie-phase3/hf/", - "gs://marin-us-central2/checkpoints/tootsie-8b-soft-raccoon-3/hf/", - "gs://marin-us-central2/checkpoints/llama-8b-tootsie-adept-phoenix/hf/", - "gs://marin-us-central2/checkpoints/tootsie-8b-sensible-starling/hf/", - "gs://marin-us-central1/checkpoints/tootsie-8b-deeper-starling/hf/", -] - - -def list_gcs_directories(gcs_path: str) -> list[tuple[str, int]]: - """List subdirectories by examining full blob paths.""" - if not gcs_path.startswith("gs://"): - raise ValueError(f"Invalid GCS path: {gcs_path}") - - path = gcs_path[5:] # Remove "gs://" - bucket_name = path.split("/")[0] - prefix = "/".join(path.split("/")[1:]) - - logger.info(f"Checking: {gcs_path}") - - # Get the bucket - client = storage.Client() - bucket = client.bucket(bucket_name) - - # List blobs with this prefix (without delimiter to get all) - blobs = bucket.list_blobs(prefix=prefix) - - # Extract potential directories from blob paths - directories = set() - step_pattern = re.compile(r"step-\d+") - - for blob in blobs: - # Remove the prefix to get the relative path - relative_path = blob.name[len(prefix) :] - - # Skip if there's no relative path - if not relative_path: - continue - - # Extract the first directory level - parts = relative_path.strip("/").split("/") - if parts: - first_dir = parts[0] - - # Check if it's a step directory - if step_pattern.match(first_dir): - directories.add(first_dir) - - # Process the directories we found - step_dirs_local = [] - for dir_name in directories: - if step_pattern.match(dir_name): - try: - step_number = int(dir_name.split("-")[1]) - full_path = f"{gcs_path}{dir_name}/" - step_dirs_local.append((full_path, step_number)) - logger.info(f"Found step directory: {full_path} with step {step_number}") - except (IndexError, ValueError) as e: - logger.error(f"Error parsing step number from {dir_name}: {e}") - - logger.info(f"Found {len(step_dirs_local)} step directories in {gcs_path}") - return step_dirs_local - - -def 
download_from_gcs(gcs_path: str, local_path: str) -> bool: - """Download contents from a GCS path to a local directory using the GCS transfer manager.""" - logger.info(f"Downloading {gcs_path} to {local_path}...") - - # Parse the GCS path (format: gs://bucket-name/path/to/files) - if not gcs_path.startswith("gs://"): - logger.error(f"Invalid GCS path format: {gcs_path}") - return False - - bucket_name = gcs_path[5:].split("/")[0] - prefix = "/".join(gcs_path[5:].split("/")[1:]) - - # Handle wildcard at the end (the original had f"{gcs_path}*") - if prefix.endswith("*"): - prefix = prefix[:-1] - - # Initialize the GCS client - client = storage.Client() - bucket = client.bucket(bucket_name) - - # List all matching blobs - blobs = list(bucket.list_blobs(prefix=prefix)) - - if not blobs: - logger.error(f"No files found in {gcs_path}") - return False - - total_files = len(blobs) - logger.info(f"Found {total_files} files to download from {gcs_path}") - - # Get the blob names to download (excluding directory placeholders) - blob_names = [] - for blob in blobs: - if not blob.name.endswith("/"): - blob_names.append(blob.name) - - if len(blob_names) < total_files: - logger.info(f"Filtered out {total_files - len(blob_names)} directory markers") - - # Ensure local directory exists - os.makedirs(local_path, exist_ok=True) - - # Log the first few blob names to debug issues - if blob_names: - logger.info(f"Sample blob names (first 3): {', '.join(blob_names[:3])}") - - # Use transfer manager to download all blobs in parallel - logger.info(f"Starting parallel download of {len(blob_names)} files...") - - transfer_manager.download_many_to_path( - bucket=bucket, - blob_names=blob_names, - destination_directory=local_path, - max_workers=8, - create_directories=True, - worker_type="process", - raise_exception=True, - ) - - logger.info(f"Download completed successfully. 
Downloaded {len(blob_names)} files.") - return True - - -def checkpoint_exists(repo_id: str, step: int, version_name: str) -> bool: - """Check if a specific revision exists in a Hugging Face repository.""" - try: - api = HfApi() - commits = api.list_repo_commits(repo_id=repo_id) - for commit in commits: - if f"step {step}" in commit.title: - return True - return False - except Exception: - return False - - -def extract_version_from_path(gcs_path: str) -> str: - """Extract the version name from a GCS path.""" - # Extract model name from path like "gs://marin-eu-west4/checkpoints/llama-8b-tootsie-0.001-19ad63/hf/" - parts = gcs_path.strip("/").split("/") - return parts[-3] - - -def upload_to_huggingface(local_path: str, repo_id: str, step: int, version_name: str) -> bool: - """Upload a local directory to Hugging Face as a specific revision.""" - logger.info(f"Uploading checkpoint {version_name}, step {step} to Hugging Face") - - # Check if repo exists, create if not - api = HfApi() - create_repo(repo_id=repo_id, exist_ok=True) - # Upload the directory - result = api.upload_folder( - folder_path=local_path, - repo_id=repo_id, - commit_message=f"Upload checkpoint for step {step} ({version_name})", - ) - try: - api.delete_tag(repo_id=repo_id, tag=version_name) - except Exception: - logger.info("Creating tag for the first time") - api.create_tag(repo_id=repo_id, tag=version_name) - logger.info("Upload completed successfully.") - logger.info(f"Commit URL: {result.commit_url}") - return True - - -def upload_gcs_to_hf(cfg: UploadConfig) -> None: - """Main function to upload model checkpoints from GCS to Hugging Face.""" - - configure_logging(level=logging.INFO) - - # Collect all step directories - all_step_dirs = [] - - # Determine which directories to process - directories_to_process = cfg.gcs_directories if cfg.gcs_directories else DEFAULT_GCS_DIRS - - # Process each directory - for directory in directories_to_process: - try: - step_dirs = list_gcs_directories(directory) - all_step_dirs.extend(step_dirs) - except Exception as e: - logger.error(f"Error listing {directory}: {e}") - - # Sort all step directories by step number - if all_step_dirs: - all_step_dirs.sort(key=lambda x: x[1]) - - # Print sorted step directories - logger.info("\nAll step directories sorted by step number:") - logger.info("-" * 50) - for full_path, _step_number in all_step_dirs: - logger.info(f"- {full_path}") - - logger.info(f"\nTotal: {len(all_step_dirs)} step directories") - - # Upload to Hugging Face - if not cfg.dry_run: - logger.info(f"\nUploading to Hugging Face repo: {cfg.hf_repo_id}") - - for full_path, step_number in all_step_dirs: - # Extract version name from the path - version_name = extract_version_from_path(full_path) - - # Check if this checkpoint already exists - if checkpoint_exists(cfg.hf_repo_id, step_number, version_name): - logger.info( - f"Step {step_number} for {version_name} already exists in HF repo {cfg.hf_repo_id}, skipping" - ) - continue - - # Create a temporary directory for downloading - with tempfile.TemporaryDirectory() as temp_dir: - logger.info(f"\nProcessing step {step_number} from {full_path} ({version_name})") - - # Download from GCS - if download_from_gcs(full_path, temp_dir): - # Upload to HF - if upload_to_huggingface(temp_dir, cfg.hf_repo_id, step_number, version_name): - logger.info( - f"Successfully uploaded step {step_number} ({version_name}) to HF repo {cfg.hf_repo_id}" - ) - else: - logger.error(f"Failed to upload step {step_number}") - else: - logger.error(f"Failed to 
download step {step_number}") - - logger.info("\nUpload process completed.") - else: - logger.info("\nDry run - showing what would be uploaded:") - logger.info("-" * 50) - - for i, (full_path, step_number) in enumerate(all_step_dirs): - version_name = extract_version_from_path(full_path) - logger.info(f"\nCheckpoint {i + 1}/{len(all_step_dirs)}:") - logger.info(f" Source: {full_path}") - logger.info(f" Target repo: {cfg.hf_repo_id}") - logger.info(f" Revision: {version_name}") - logger.info(f" Commit message: Upload checkpoint for step {step_number} ({version_name})") - - # Try to estimate what files would be uploaded - try: - # Use gsutil to list files in the directory - cmd = ["gsutil", "ls", f"{full_path}"] - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode == 0: - files = result.stdout.strip().split("\n") - # Filter out empty strings and limit to 5 for display - files = [f for f in files if f] - - if files: - logger.info( - f" Example files that would be uploaded ({min(len(files), 5)} of {len(files)}):" - ) - for file in files[:5]: - logger.info(f" - {os.path.basename(file)}") - if len(files) > 5: - logger.info(f" - ... and {len(files) - 5} more") - except Exception as e: - logger.error(f" Could not list files: {e}") - - logger.info("\nDry run completed - no actual uploads performed.") - else: - logger.warning("\nNo step directories found in any of the paths.") - logger.warning("You might want to check if:") - logger.warning("1. The paths are correct") - logger.warning("2. You have permissions to access these buckets") - logger.warning("3. There are step directories in these locations") - - -def main(): - """Command line entry point for direct script usage.""" - parser = argparse.ArgumentParser(description="Upload checkpoints from GCS to Hugging Face") - parser.add_argument( - "--repo-id", required=True, help='Target Hugging Face repository ID (e.g., "username/model-name")' - ) - parser.add_argument("--dry-run", action="store_true", help="Only list checkpoints without uploading") - parser.add_argument( - "--directories", - nargs="+", - help="Process specific GCS directories instead of the built-in list. 
Multiple directories can be provided.", - ) - args = parser.parse_args() - - # Create config from args - config = UploadConfig( - hf_repo_id=args.repo_id, gcs_directories=args.directories if args.directories else [], dry_run=args.dry_run - ) - - # Check if application default credentials are set - if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ: - logger.warning("Warning: GOOGLE_APPLICATION_CREDENTIALS environment variable not set.") - logger.warning("Make sure you're authenticated with Google Cloud before running this script.") - logger.warning("You can authenticate using: gcloud auth application-default login") - - # Run the upload function - upload_gcs_to_hf(config) - +from marin.datakit.download.upload_gcs_to_hf import UploadConfig as UploadConfig +from marin.datakit.download.upload_gcs_to_hf import main as main +from marin.datakit.download.upload_gcs_to_hf import upload_gcs_to_hf as upload_gcs_to_hf if __name__ == "__main__": main() diff --git a/lib/marin/src/marin/download/nemotron_cc/__init__.py b/lib/marin/src/marin/download/nemotron_cc/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/nemotron_cc/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py b/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py index 77c9d82cf5..81251cb66c 100644 --- a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py +++ b/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py @@ -1,119 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.nemotron_cc -""" -Download and process Nemotron-CC dataset from Common Crawl. 
- -Example Usage: -uv run zephyr --backend=ray --max-parallelism=100 --memory=4GB \ - lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py \ - --output_path gs://bucket/nemotron-output -""" - -import json -import logging -import os -from collections.abc import Iterator -from dataclasses import dataclass - -import requests -import zstandard -from iris.marin_fs import open_url -from marin.execution import THIS_OUTPUT_PATH -from marin.utils import fsspec_exists -from requests.adapters import HTTPAdapter -from urllib3.util import Retry -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename - -logger = logging.getLogger(__name__) - -myagent = "marin-nemotron-ingress/1.0" -NCC_PATH_FILE_URL = "https://data.commoncrawl.org/contrib/Nemotron/Nemotron-CC/data-jsonl.paths.gz" - - -def _iter_jsonl_from_zstd_stream(raw_stream) -> Iterator[dict]: - """Yield parsed JSON objects from a zstd-compressed JSONL stream.""" - dctx = zstandard.ZstdDecompressor() - with dctx.stream_reader(raw_stream) as reader: - buf = bytearray() - while True: - chunk = reader.read(1048576) - if not chunk: - break - buf.extend(chunk) - while True: - newline_pos = buf.find(b"\n") - if newline_pos < 0: - break - line_bytes = bytes(buf[:newline_pos]) - del buf[: newline_pos + 1] - if not line_bytes.strip(): - continue - yield json.loads(line_bytes) - - -def download_single_nemotron_path(input_file_path: str, output_file_path: str) -> dict: - """Fetches content from a Common Crawl path, streaming records to zstd output.""" - cc_url = f"https://data.commoncrawl.org/{input_file_path}" - logger.info(f"Downloading Nemotron CC file {cc_url} to {output_file_path}") - - session = requests.Session() - retries = Retry(total=5, backoff_factor=1.0, status_forcelist=[500, 502, 503, 504], allowed_methods=["GET"]) - adapter = HTTPAdapter(max_retries=retries) - session.mount("https://", adapter) - session.mount("http://", adapter) - - response = session.get(cc_url, headers={"user-agent": myagent}, stream=True) - response.raise_for_status() - - num_records = 0 - with atomic_rename(output_file_path) as temp_path: - with open_url(temp_path, "w", compression="zstd") as out: - for record in _iter_jsonl_from_zstd_stream(response.raw): - dolma_record = { - "id": record["warc_record_id"], - "text": record["text"], - "source": "nemotron", - "format": "text", - "metadata": {f"nemotron_{k}": v for k, v in record.items() if k not in ("warc_record_id", "text")}, - } - print(json.dumps(dolma_record), file=out) - num_records += 1 - - return {"input_file": input_file_path, "output_file": output_file_path, "num_records": num_records} - - -@dataclass -class NemotronIngressConfig: - output_path: str = THIS_OUTPUT_PATH - - -def download_nemotron_cc(cfg: NemotronIngressConfig): - paths_file_path = os.path.join(cfg.output_path, "data-jsonl.paths") - logger.info(f"Downloading Nemotron CC path file {paths_file_path}") - - with open_url(NCC_PATH_FILE_URL, "rb") as f, open_url(paths_file_path, "wb") as f_out: - f_out.write(f.read()) - - logger.info(f"Reading paths from {paths_file_path}") - all_files = [] - with open_url(paths_file_path, "r", compression="gzip") as f: - for line in f: - file = line.strip() - output_file_path = os.path.join(cfg.output_path, file).replace("jsonl.zstd", "jsonl.zst") - all_files.append((file, output_file_path)) - - logger.info(f"Processing {len(all_files)} Nemotron CC files") - - pipeline = ( - Dataset.from_list(all_files) - .filter(lambda file_info: not fsspec_exists(file_info[1])) - .map(lambda 
file_info: download_single_nemotron_path(*file_info)) - .write_jsonl(os.path.join(cfg.output_path, ".metrics/download-{shard:05d}.jsonl"), skip_existing=True) - ) - - ctx = ZephyrContext(name="download-nemotron-cc") - ctx.execute(pipeline) - - logger.info(f"Downloaded Nemotron CC files to {cfg.output_path}") +from marin.datakit.download.nemotron_cc import NemotronIngressConfig as NemotronIngressConfig +from marin.datakit.download.nemotron_cc import download_nemotron_cc as download_nemotron_cc +from marin.datakit.download.nemotron_cc import download_single_nemotron_path as download_single_nemotron_path diff --git a/lib/marin/src/marin/download/uncheatable_eval/__init__.py b/lib/marin/src/marin/download/uncheatable_eval/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/uncheatable_eval/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/uncheatable_eval/download.py b/lib/marin/src/marin/download/uncheatable_eval/download.py index b77195ed63..9baf9db8ad 100644 --- a/lib/marin/src/marin/download/uncheatable_eval/download.py +++ b/lib/marin/src/marin/download/uncheatable_eval/download.py @@ -1,394 +1,12 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.uncheatable_eval -"""Download and normalize the latest Uncheatable Eval data dumps.""" - -from __future__ import annotations - -import json -import logging -import os -import posixpath -import re -from collections.abc import Iterable -from dataclasses import dataclass -from typing import Any - -import requests -from iris.marin_fs import open_url -from marin.execution import THIS_OUTPUT_PATH, ExecutorStep, VersionedValue, ensure_versioned, this_output_path -from marin.utils import fsspec_mkdirs -from requests.adapters import HTTPAdapter -from urllib3.util import Retry -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename - -logger = logging.getLogger(__name__) - -FILENAME_PATTERN = re.compile(r"^(?P.+)_(?P\d{8})to(?P\d{8})(?P(?:\.[^.]+)*)$") - -TEXT_FIELD_CANDIDATES: tuple[str, ...] = ( - "text", - "body", - "content", - "article", - "document", - "raw_text", - "code", - "message", - "description", - "story", -) - -LIST_FIELD_CANDIDATES: tuple[str, ...] = ( - "paragraphs", - "sentences", - "lines", - "messages", +from marin.datakit.download.uncheatable_eval import UncheatableEvalDataset as UncheatableEvalDataset +from marin.datakit.download.uncheatable_eval import ( + UncheatableEvalDownloadConfig as UncheatableEvalDownloadConfig, ) - -ID_FIELD_CANDIDATES: tuple[str, ...] 
= ( - "id", - "uuid", - "guid", - "doc_id", - "document_id", - "article_id", - "hash", - "sha", - "uid", +from marin.datakit.download.uncheatable_eval import ( + download_latest_uncheatable_eval as download_latest_uncheatable_eval, ) - - -@dataclass(frozen=True) -class UncheatableEvalDataset: - """Information about a single data dump file from the Uncheatable Eval repository.""" - - benchmark: str - start_date: str - end_date: str - name: str - download_url: str - sha: str | None = None - size: int | None = None - - @property - def date_range(self) -> str: - return f"{self.start_date}to{self.end_date}" - - @property - def source_label(self) -> str: - return f"{self.benchmark}:{self.date_range}" - - def output_filename(self, suffix: str = ".jsonl.gz") -> str: - return f"{self.benchmark}_{self.date_range}{suffix}" - - -@dataclass -class UncheatableEvalDownloadConfig: - """Configuration for downloading and normalizing Uncheatable Eval dumps.""" - - output_path: str | VersionedValue[str] = THIS_OUTPUT_PATH - repo_owner: str | VersionedValue[str] = "Jellyfish042" - repo_name: str | VersionedValue[str] = "uncheatable_eval" - data_path: str | VersionedValue[str] = "data" - branch: str | VersionedValue[str] = "master" - max_concurrent_downloads: int = 8 - request_timeout: int = 120 - github_token: str | None = None - skip_existing: bool = True - metadata_filename: str = "metadata.json" - - -def _http_headers(cfg: UncheatableEvalDownloadConfig) -> dict[str, str]: - headers = {"Accept": "application/vnd.github+json"} - token = cfg.github_token or os.environ.get("GITHUB_TOKEN") - if token: - headers["Authorization"] = f"Bearer {token}" - return headers - - -def _fetch_directory_listing(cfg: UncheatableEvalDownloadConfig) -> list[dict[str, Any]]: - """Return the list of files in the configured GitHub repository directory.""" - - headers = _http_headers(cfg) - base_url = f"https://api.github.com/repos/{cfg.repo_owner!s}/{cfg.repo_name!s}/contents/{cfg.data_path!s}" - params = {"ref": str(cfg.branch)} - response = requests.get(base_url, headers=headers, params=params, timeout=cfg.request_timeout) - response.raise_for_status() - payload = response.json() - if not isinstance(payload, list): - raise ValueError(f"Unexpected response from GitHub API: {payload!r}") - return payload - - -def _parse_available_dumps(entries: Iterable[dict[str, Any]]) -> list[UncheatableEvalDataset]: - """Parse GitHub directory entries into dataset metadata.""" - - datasets: list[UncheatableEvalDataset] = [] - for entry in entries: - name = entry.get("name") - if not isinstance(name, str): - continue - match = FILENAME_PATTERN.match(name) - if not match: - continue - benchmark = match.group("benchmark") - start = match.group("start") - end = match.group("end") - download_url = entry.get("download_url") - if not isinstance(download_url, str): - logger.debug("Skipping %s because it has no download_url", name) - continue - datasets.append( - UncheatableEvalDataset( - benchmark=benchmark, - start_date=start, - end_date=end, - name=name, - download_url=download_url, - sha=entry.get("sha"), - size=entry.get("size"), - ) - ) - return datasets - - -def _select_latest_dumps(datasets: Iterable[UncheatableEvalDataset]) -> list[UncheatableEvalDataset]: - """Select the latest dump for each benchmark based on the end date (and start date as tie breaker).""" - - latest: dict[str, UncheatableEvalDataset] = {} - for dataset in datasets: - existing = latest.get(dataset.benchmark) - if existing is None: - latest[dataset.benchmark] = dataset - 
continue - candidate_key = (dataset.end_date, dataset.start_date, dataset.name) - existing_key = (existing.end_date, existing.start_date, existing.name) - if candidate_key > existing_key: - latest[dataset.benchmark] = dataset - return sorted(latest.values(), key=lambda d: d.benchmark) - - -def _extract_id(raw: Any, dataset: UncheatableEvalDataset, index: int) -> str: - if isinstance(raw, dict): - for key in ID_FIELD_CANDIDATES: - value = raw.get(key) - if value: - return str(value) - metadata = raw.get("metadata") - if isinstance(metadata, dict): - for key in ID_FIELD_CANDIDATES: - value = metadata.get(key) - if value: - return str(value) - return f"{dataset.benchmark}_{dataset.date_range}_{index:06d}" - - -def _join_list_field(value: Any) -> str | None: - if isinstance(value, list): - text_items = [str(item) for item in value if item is not None] - if text_items: - return "\n".join(text_items) - return None - - -def _extract_text(raw: Any) -> str | None: - if raw is None: - return None - if isinstance(raw, str): - return raw - if isinstance(raw, dict): - for key in TEXT_FIELD_CANDIDATES: - value = raw.get(key) - if isinstance(value, str) and value.strip(): - return value - for key in TEXT_FIELD_CANDIDATES: - value = raw.get(key) - joined = _join_list_field(value) - if joined: - return joined - for key in LIST_FIELD_CANDIDATES: - joined = _join_list_field(raw.get(key)) - if joined: - return joined - title = raw.get("title") - body = raw.get("body") - if isinstance(title, str) and isinstance(body, str): - combined = f"{title.strip()}\n\n{body.strip()}" - if combined.strip(): - return combined - if isinstance(title, str) and title.strip(): - return title - return json.dumps(raw, ensure_ascii=False) - return str(raw) - - -def _normalize_record(raw: Any, dataset: UncheatableEvalDataset, index: int) -> dict[str, str]: - text = _extract_text(raw) - if text is None or not str(text).strip(): - raise ValueError(f"Record {index} in {dataset.name} does not contain text") - record_id = _extract_id(raw, dataset, index) - return {"id": record_id, "text": text, "source": dataset.source_label} - - -def _download_and_convert_single( - task: DownloadTask, -) -> dict[str, Any]: - session = requests.Session() - retries = Retry(total=5, backoff_factor=1.0, status_forcelist=[500, 502, 503, 504], allowed_methods=["GET"]) - adapter = HTTPAdapter(max_retries=retries) - session.mount("https://", adapter) - session.mount("http://", adapter) - - logger.info("Downloading %s from %s", task.dataset.name, task.download_url) - response = session.get(task.download_url, timeout=task.cfg.request_timeout, headers=_http_headers(task.cfg)) - response.raise_for_status() - - try: - payload = response.json() - except ValueError as exc: - raise ValueError(f"Failed to decode JSON payload for {task.dataset.name}") from exc - - if not isinstance(payload, list): - raise ValueError(f"Expected list in dataset {task.dataset.name}, found {type(payload).__name__}") - - fsspec_mkdirs(os.path.dirname(task.output_file_path), exist_ok=True) - - record_count = 0 - with atomic_rename(task.output_file_path) as temp_path: - with open_url(temp_path, "wt", encoding="utf-8", compression="gzip") as outfile: - for index, raw in enumerate(payload): - normalized = _normalize_record(raw, task.dataset, index) - json.dump(normalized, outfile, ensure_ascii=False) - outfile.write("\n") - record_count += 1 - - logger.info("Wrote %s records to %s", record_count, task.output_file_path) - return {"records": record_count, "output_file": task.output_file_path} - - 
-@dataclass -class DownloadTask: - download_url: str - output_file_path: str - dataset: UncheatableEvalDataset - cfg: UncheatableEvalDownloadConfig - - -def _generate_tasks( - datasets: Iterable[UncheatableEvalDataset], - cfg: UncheatableEvalDownloadConfig, -) -> tuple[list[DownloadTask], list[UncheatableEvalDataset]]: - tasks: list[DownloadTask] = [] - filtered: list[UncheatableEvalDataset] = [] - for dataset in datasets: - output_file = posixpath.join(str(cfg.output_path), dataset.output_filename()) - tasks.append(DownloadTask(dataset.download_url, output_file, dataset, cfg)) - filtered.append(dataset) - return tasks, filtered - - -def _write_metadata(cfg: UncheatableEvalDownloadConfig, records: list[dict[str, Any]]) -> None: - if not records: - return - metadata_path = posixpath.join(str(cfg.output_path), cfg.metadata_filename) - with open_url(metadata_path, "w", encoding="utf-8") as meta_file: - json.dump(records, meta_file, indent=2, ensure_ascii=False) - logger.info("Wrote metadata to %s", metadata_path) - - -def download_latest_uncheatable_eval(cfg: UncheatableEvalDownloadConfig) -> dict[str, Any]: - """Download and normalize the newest Uncheatable Eval dump for each benchmark.""" - - entries = _fetch_directory_listing(cfg) - datasets = _parse_available_dumps(entries) - latest_datasets = _select_latest_dumps(datasets) - - if not latest_datasets: - logger.warning("No datasets found that match the expected naming pattern") - return {"success": False, "reason": "no_datasets"} - - output_path = str(cfg.output_path) - fsspec_mkdirs(output_path, exist_ok=True) - - tasks, filtered_datasets = _generate_tasks(latest_datasets, cfg) - - if not tasks: - logger.info("No new datasets to process") - return {"success": True, "reason": "already_processed", "skipped": True} - - metadata_records: list[dict[str, Any]] = [] - - pipeline = ( - Dataset.from_list(tasks) - .map(lambda task: _download_and_convert_single(task)) - .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) - ) - ctx = ZephyrContext(name="download-uncheatable-eval") - output_paths = ctx.execute(pipeline) - - for dataset, metadata_file in zip(filtered_datasets, output_paths, strict=True): - with open_url(metadata_file, "r", encoding="utf-8") as meta_file: - result = json.load(meta_file) - - try: - metadata_records.append( - { - "benchmark": dataset.benchmark, - "start_date": dataset.start_date, - "end_date": dataset.end_date, - "source": dataset.source_label, - "output_file": posixpath.join(output_path, dataset.output_filename()), - "records": result.get("records"), - "sha": dataset.sha, - "size": dataset.size, - } - ) - except Exception: - logger.exception("Failed to process dataset %s", dataset.name) - raise - - _write_metadata(cfg, metadata_records) - return {"success": True, "processed": metadata_records} - - -def make_uncheatable_eval_step( - *, - name: str = "raw/uncheatable-eval/latest", - repo_owner: str = "ziqing-huang", - repo_name: str = "uncheatable_eval", - data_path: str = "data", - branch: str = "master", - max_concurrent_downloads: int = 8, - request_timeout: int = 120, - github_token: str | None = None, - skip_existing: bool = True, -) -> ExecutorStep[UncheatableEvalDownloadConfig]: - """Create an :class:`ExecutorStep` that downloads the latest Uncheatable Eval dumps.""" - - config = UncheatableEvalDownloadConfig( - output_path=this_output_path(), - repo_owner=ensure_versioned(repo_owner), - repo_name=ensure_versioned(repo_name), - data_path=ensure_versioned(data_path), - 
branch=ensure_versioned(branch), - max_concurrent_downloads=max_concurrent_downloads, - request_timeout=request_timeout, - github_token=github_token, - skip_existing=skip_existing, - ) - - return ExecutorStep( - name=name, - fn=download_latest_uncheatable_eval, - config=config, - ) - - -__all__ = [ - "UncheatableEvalDataset", - "UncheatableEvalDownloadConfig", - "download_latest_uncheatable_eval", - "make_uncheatable_eval_step", -] +from marin.datakit.download.uncheatable_eval import make_uncheatable_eval_step as make_uncheatable_eval_step diff --git a/lib/marin/src/marin/download/wikipedia/__init__.py b/lib/marin/src/marin/download/wikipedia/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/lib/marin/src/marin/download/wikipedia/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/wikipedia/download.py b/lib/marin/src/marin/download/wikipedia/download.py index 552e546bf9..9b50143040 100644 --- a/lib/marin/src/marin/download/wikipedia/download.py +++ b/lib/marin/src/marin/download/wikipedia/download.py @@ -1,125 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +# Backward-compat shim. Canonical location: marin.datakit.download.wikipedia -""" -wikipedia/download.py - -Download script for the Wikipedia raw HTML data, provided by Wikimedia. - -Home Page: https://dumps.wikimedia.org/other/enterprise_html/runs/ - -Example Usage (production, large dataset): -ENWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/enwiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz -uv run zephyr --backend=ray --max-parallelism=10 \ - lib/marin/src/marin/download/wikipedia/download.py \ - --input_urls $ENWIKI \ - --revision 20250320 --output_path gs://path/to/output - -Example Usage (local testing, small dataset): -SIMPLEWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/simplewiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz -uv run zephyr --backend=threadpool --max-parallelism=4 --entry-point=download \ - lib/marin/src/marin/download/wikipedia/download.py \ - --input_urls "[$SIMPLEWIKI]" \ - --revision 20250320 --output_path /tmp/wikipedia_test - -Note: The enwiki-NS0 file (English Wikipedia, namespace 0 = articles) is approximately 130 GB compressed. - The simplewiki-NS0 file (Simple English Wikipedia) is much smaller at ~2 GB compressed. 
-""" - -import logging -import os -import tarfile -from collections.abc import Iterable -from dataclasses import dataclass - -import draccus -import requests -from iris.marin_fs import open_url -from marin.utils import fsspec_size -from tqdm_loggable.auto import tqdm -from zephyr import Dataset, ZephyrContext, atomic_rename, load_jsonl - -logger = logging.getLogger(__name__) - - -@dataclass -class DownloadConfig: - input_urls: list[str] - revision: str - output_path: str - - -def download_tar(url: str, output_prefix) -> str: - shard_filename = url.split("/")[-1] - output_filename = os.path.join(output_prefix, shard_filename) - logger.info(f"Downloading URL: {url} to {output_filename}") - - try: - total_size = fsspec_size(url) - pbar = tqdm(total=total_size, desc="Downloading File", unit="B", unit_scale=True) - - with atomic_rename(output_filename) as tmp_filename, open_url(tmp_filename, "wb") as f: - r = requests.get(url, stream=True) - - for chunk in r.raw.stream(20 * 1024 * 1024, decode_content=False): - if chunk: - f.write(chunk) - f.flush() - - pbar.update(len(chunk)) - - return output_filename - except Exception as e: - logger.error(f"Error downloading URL: {url}") - raise e - - -def process_file(input_file: str, output_path: str) -> Iterable[str]: - logger.info(f"Processing file: {input_file}") - logger.info(f"Output path: {output_path}") - - try: - with open_url(input_file) as f: - with tarfile.open(fileobj=f, mode="r:gz") as tr: - for info in tr: - with tr.extractfile(info) as file: - file_content = file.read() - file_path = os.path.join(output_path, info.name + ".gz") - - # Each file is a .ndjson file, which contains about 18k-21k articles - # per file with size ranging from 200MB to 300MB - with ( - atomic_rename(file_path) as tmpfile_path, - open_url(tmpfile_path, "wb", compression="gzip") as output_f, - ): - output_f.write(file_content) - yield file_path - - except Exception as e: - logger.error(f"Error processing file: {input_file}") - raise e - - -@draccus.wrap() -def download(cfg: DownloadConfig) -> None: - """Download and process Wikipedia data.""" - logger.info("Starting transfer of Wikipedia dump...") - output_base = os.path.join(cfg.output_path, cfg.revision) - - ctx = ZephyrContext(name="download-wikipedia") - download_metrics = ctx.execute( - Dataset.from_list(cfg.input_urls) - .map(lambda url: download_tar(url, output_base)) - .write_jsonl(f"{output_base}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), - ) - - # load all of the output filenames to process - downloads = ctx.execute(Dataset.from_list(download_metrics).flat_map(load_jsonl)) - - extracted = ctx.execute( - Dataset.from_list(downloads) - .flat_map(lambda file: process_file(file, output_base)) - .write_jsonl(f"{output_base}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), - ) - - logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) +from marin.datakit.download.wikipedia import download as download +from marin.datakit.download.wikipedia import download_tar as download_tar +from marin.datakit.download.wikipedia import process_file as process_file From 23bf3c4bb9a91697109a22998b9b8ad59edb23f5 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:16:34 -0700 Subject: [PATCH 07/56] Extract pretraining download definitions into datakit/download/pretraining.py Creates canonical StepSpec factory functions for all pretraining dataset downloads (fineweb, dclm, slimpajama, etc.) in pretraining.py. 
Updates simple.py to import from there and build the backward-compat downloads dict via _build_downloads(). Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/simple.py | 195 +++++------------- .../src/marin/datakit/download/pretraining.py | 119 +++++++++++ 2 files changed, 167 insertions(+), 147 deletions(-) create mode 100644 lib/marin/src/marin/datakit/download/pretraining.py diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py index 7f51364735..79910f3741 100644 --- a/experiments/pretraining_datasets/simple.py +++ b/experiments/pretraining_datasets/simple.py @@ -12,8 +12,20 @@ from levanter.data.text import TextLmDatasetFormat from levanter.store.cache import CacheOptions -from marin.download.huggingface.download_hf import DownloadConfig, download_hf -from marin.execution.executor import ExecutorStep, this_output_path, versioned +from marin.datakit.download.pretraining import ( + dclm_baseline_download, + dclm_baseline_wrong_download, + dolma3_mix_150b_1025_download, + fineweb_download, + fineweb_edu_download, + proofpile_2_download, + slimpajama_6b_download, + slimpajama_download, + starcoderdata_download, + the_pile_openwebtext2_download, + the_stack_dedup_download, +) +from marin.execution.executor import ExecutorStep, InputName, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from experiments.llama import llama3_tokenizer @@ -25,7 +37,7 @@ def _tokenize_simple( name: str, - raw_dataset: ExecutorStep, + raw_dataset: ExecutorStep | InputName, tokenizer: str | None = None, override_path: str | None = None, text_format: TextLmDatasetFormat = TextLmDatasetFormat(), @@ -57,153 +69,42 @@ def _tokenize_simple( # RAW DATASET DOWNLOADS # ============================================================================ -downloads = { - "fineweb": ExecutorStep( - name="raw/fineweb", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="HuggingFaceFW/fineweb", - revision="cd85054", - gcs_output_path=this_output_path(), - wait_for_completion=True, + +def _build_downloads() -> dict[str, ExecutorStep | InputName]: + """Build the downloads dict from canonical StepSpec definitions in pretraining.py.""" + fineweb_edu_base = fineweb_edu_download().as_executor_step() + + return { + "fineweb": fineweb_download().as_executor_step(), + "fineweb_edu": fineweb_edu_base.cd("data"), + "fineweb_edu_sample_10bt": fineweb_edu_base.cd("sample/10BT"), + "fineweb_edu_sample_100bt": fineweb_edu_base.cd("sample/100BT"), + "fineweb_edu_sample_350bt": fineweb_edu_base.cd("sample/350BT"), + "slimpajama": ( + slimpajama_download() + .as_executor_step() + .cd("2d0accd/huggingface.co/datasets/cerebras/SlimPajama-627B/resolve/2d0accd") ), - override_output_path="raw/fineweb", - ), - "fineweb_edu": ( - ( - fineweb_edu_base_step := ExecutorStep( - name="raw/fineweb-edu", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="HuggingFaceFW/fineweb-edu", - revision=versioned((revision := "87f0914")), - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path=f"raw/fineweb-edu-{revision}", - ) - ).cd("data") - ), - "fineweb_edu_sample_10bt": fineweb_edu_base_step.cd("sample/10BT"), - "fineweb_edu_sample_100bt": fineweb_edu_base_step.cd("sample/100BT"), - "fineweb_edu_sample_350bt": fineweb_edu_base_step.cd("sample/350BT"), - "slimpajama": ( - ExecutorStep( - name="raw/SlimPajama-627B", - fn=download_hf, - config=DownloadConfig( - 
hf_dataset_id="cerebras/SlimPajama-627B", - revision="2d0accd", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/SlimPajama-627B-262830", - ).cd("2d0accd/huggingface.co/datasets/cerebras/SlimPajama-627B/resolve/2d0accd") - ), - "slimpajama_6b": ( - ExecutorStep( - name="raw/SlimPajama-6B", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="DKYoon/SlimPajama-6B", - revision="b5f90f4", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/SlimPajama-6B-be35b7", - ).cd("data") - ), - "dolma3_mix_150b_1025": ( - ExecutorStep( - name="raw/dolma3_mix-150B-1025", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="allenai/dolma3_mix-150B-1025", - revision="15d04ee", - gcs_output_path=this_output_path(), - wait_for_completion=True, - append_sha_to_path=True, - ), - override_output_path="raw/dolma3_mix-150B-1025-15d04ee", - ).cd("15d04ee") - ), - "dclm_baseline_wrong": ExecutorStep( - name="raw/dclm-baseline-1.0", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="mlfoundations/dclm-baseline-1.0", - revision="a3b142c", - gcs_output_path=this_output_path(), - wait_for_completion=True, + "slimpajama_6b": slimpajama_6b_download().as_executor_step().cd("data"), + "dolma3_mix_150b_1025": dolma3_mix_150b_1025_download().as_executor_step().cd("15d04ee"), + "dclm_baseline_wrong": dclm_baseline_wrong_download().as_executor_step(), + "dclm_baseline": dclm_baseline_download().as_executor_step().cd("a3b142c"), + "the_stack_dedup": the_stack_dedup_download().as_executor_step().cd("17cad72"), + "proofpile_2": ( + proofpile_2_download() + .as_executor_step() + .cd("901a927/huggingface.co/datasets/EleutherAI/proof-pile-2/resolve/901a927") ), - override_output_path="raw/dclm_WRONG_20250211/", - ), - "dclm_baseline": ( - ExecutorStep( - name="raw/dclm-baseline-1.0", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="mlfoundations/dclm-baseline-1.0", - revision="a3b142c", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/dclm", - ).cd("a3b142c") - ), - "the_stack_dedup": ( - ExecutorStep( - name="raw/the-stack-dedup", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="bigcode/the-stack-dedup", - revision="17cad72", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/the-stack-dedup-4ba450", - ).cd("17cad72") - ), - "proofpile_2": ( - ExecutorStep( - name="raw/proof-pile-2", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="EleutherAI/proof-pile-2", - revision="901a927", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/proof-pile-2-f1b1d8", - ).cd("901a927/huggingface.co/datasets/EleutherAI/proof-pile-2/resolve/901a927") - ), - "the_pile_openwebtext2": ( - ExecutorStep( - name="raw/the_pile_openwebtext2", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="vietgpt/the_pile_openwebtext2", - revision="1de27c6", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/the_pile_openwebtext2", - ).cd("1de27c6/huggingface.co/datasets/vietgpt/the_pile_openwebtext2/resolve/1de27c6") - ), - # TODO: Earlier datasets were stored in gcs_output_path/ instead of gcs_output_path. - # Migrate the dataset and cd can be removed. 
- "starcoderdata": ExecutorStep( - name="raw/starcoderdata", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="bigcode/starcoderdata", - revision="9fc30b5", - gcs_output_path=this_output_path(), - wait_for_completion=True, + "the_pile_openwebtext2": ( + the_pile_openwebtext2_download() + .as_executor_step() + .cd("1de27c6/huggingface.co/datasets/vietgpt/the_pile_openwebtext2/resolve/1de27c6") ), - override_output_path="raw/starcoderdata-720c8c", - ), -} + "starcoderdata": starcoderdata_download().as_executor_step(), + } + + +downloads = _build_downloads() # ============================================================================ diff --git a/lib/marin/src/marin/datakit/download/pretraining.py b/lib/marin/src/marin/datakit/download/pretraining.py new file mode 100644 index 0000000000..3300820ba3 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/pretraining.py @@ -0,0 +1,119 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Pre-defined download steps for common pretraining datasets. + +Each function returns a StepSpec for downloading a specific dataset from +HuggingFace. These are the canonical definitions — experiments should +import from here rather than defining download steps inline. + +For datasets where the actual data lives in a subdirectory of the download +(e.g. fineweb-edu has data under ``data/``), the function returns the +StepSpec for the base download. Consumers that need the subdirectory path +should use ``step.output_path + "/data"`` or convert to ExecutorStep and +use ``.cd("data")``. +""" + +from marin.datakit.download.huggingface import download_hf_step +from marin.execution.step_spec import StepSpec + + +def fineweb_download() -> StepSpec: + return download_hf_step( + "raw/fineweb", + hf_dataset_id="HuggingFaceFW/fineweb", + revision="cd85054", + override_output_path="raw/fineweb", + ) + + +def fineweb_edu_download() -> StepSpec: + """Base download for fineweb-edu. 
Data is under the ``data/`` subdirectory.""" + return download_hf_step( + "raw/fineweb-edu", + hf_dataset_id="HuggingFaceFW/fineweb-edu", + revision="87f0914", + override_output_path="raw/fineweb-edu-87f0914", + ) + + +def slimpajama_download() -> StepSpec: + return download_hf_step( + "raw/SlimPajama-627B", + hf_dataset_id="cerebras/SlimPajama-627B", + revision="2d0accd", + override_output_path="raw/SlimPajama-627B-262830", + ) + + +def slimpajama_6b_download() -> StepSpec: + return download_hf_step( + "raw/SlimPajama-6B", + hf_dataset_id="DKYoon/SlimPajama-6B", + revision="b5f90f4", + override_output_path="raw/SlimPajama-6B-be35b7", + ) + + +def dolma3_mix_150b_1025_download() -> StepSpec: + return download_hf_step( + "raw/dolma3_mix-150B-1025", + hf_dataset_id="allenai/dolma3_mix-150B-1025", + revision="15d04ee", + override_output_path="raw/dolma3_mix-150B-1025-15d04ee", + ) + + +def dclm_baseline_download() -> StepSpec: + return download_hf_step( + "raw/dclm-baseline-1.0", + hf_dataset_id="mlfoundations/dclm-baseline-1.0", + revision="a3b142c", + override_output_path="raw/dclm", + ) + + +def the_stack_dedup_download() -> StepSpec: + return download_hf_step( + "raw/the-stack-dedup", + hf_dataset_id="bigcode/the-stack-dedup", + revision="17cad72", + override_output_path="raw/the-stack-dedup-4ba450", + ) + + +def proofpile_2_download() -> StepSpec: + return download_hf_step( + "raw/proof-pile-2", + hf_dataset_id="EleutherAI/proof-pile-2", + revision="901a927", + override_output_path="raw/proof-pile-2-f1b1d8", + ) + + +def the_pile_openwebtext2_download() -> StepSpec: + return download_hf_step( + "raw/the_pile_openwebtext2", + hf_dataset_id="vietgpt/the_pile_openwebtext2", + revision="1de27c6", + override_output_path="raw/the_pile_openwebtext2", + ) + + +def starcoderdata_download() -> StepSpec: + return download_hf_step( + "raw/starcoderdata", + hf_dataset_id="bigcode/starcoderdata", + revision="9fc30b5", + override_output_path="raw/starcoderdata-720c8c", + ) + + +def dclm_baseline_wrong_download() -> StepSpec: + """Legacy download with incorrect path. Kept for backward compat.""" + return download_hf_step( + "raw/dclm-baseline-1.0", + hf_dataset_id="mlfoundations/dclm-baseline-1.0", + revision="a3b142c", + override_output_path="raw/dclm_WRONG_20250211/", + ) From 6ffda47a494173c7a5afc4b034b9af6b76525e02 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 10:32:44 -0700 Subject: [PATCH 08/56] Fix mock targets in download tests to use canonical module paths Update mock/patch targets in test_huggingface.py, test_nemotron_cc.py, and test_dclm_hq.py to point at the canonical marin.datakit.download.* locations. Add _relative_path_in_source to the HF download shim since the test imports it. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/download/huggingface/download_hf.py | 1 + tests/download/test_dclm_hq.py | 2 +- tests/download/test_huggingface.py | 14 +++++++------- tests/download/test_nemotron_cc.py | 4 ++-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/marin/src/marin/download/huggingface/download_hf.py b/lib/marin/src/marin/download/huggingface/download_hf.py index 9912a5d2c0..2dd0177806 100644 --- a/lib/marin/src/marin/download/huggingface/download_hf.py +++ b/lib/marin/src/marin/download/huggingface/download_hf.py @@ -3,6 +3,7 @@ # Backward-compat shim. 
Canonical location: marin.datakit.download.huggingface from marin.datakit.download.huggingface import DownloadConfig as DownloadConfig +from marin.datakit.download.huggingface import _relative_path_in_source as _relative_path_in_source from marin.datakit.download.huggingface import download_hf as download_hf from marin.datakit.download.huggingface import ensure_fsspec_path_writable as ensure_fsspec_path_writable from marin.datakit.download.huggingface import main as main diff --git a/tests/download/test_dclm_hq.py b/tests/download/test_dclm_hq.py index 1636f3c34d..21ec33c5b7 100644 --- a/tests/download/test_dclm_hq.py +++ b/tests/download/test_dclm_hq.py @@ -170,7 +170,7 @@ def mock_requests_get(url, **kwargs): raise ValueError(f"Unexpected URL: {url}") - with patch("marin.download.dclm_hq.download_dclm_hq_html.requests.get", side_effect=mock_requests_get): + with patch("marin.datakit.download.dclm_hq.requests.get", side_effect=mock_requests_get): cfg = DCLMHQDownloadConfig(input_path=str(tmp_path / "input"), output_path=str(output_dir)) extract_dclm_hq_dump(cfg) diff --git a/tests/download/test_huggingface.py b/tests/download/test_huggingface.py index 1019c83633..24a5bc6169 100644 --- a/tests/download/test_huggingface.py +++ b/tests/download/test_huggingface.py @@ -10,13 +10,13 @@ import pandas as pd import pytest -from marin.download.huggingface.download_hf import ( +from marin.datakit.download.huggingface import ( DownloadConfig, _relative_path_in_source, download_hf, stream_file_to_fsspec, ) -from marin.download.huggingface.stream_remove_columns import ( +from marin.datakit.download.stream_remove_columns import ( DatasetConfig, prune_hf_dataset, ) @@ -81,7 +81,7 @@ def test_download_hf_basic(mock_hf_fs, tmp_path): ) # Mock HfFileSystem creation - with patch("marin.download.huggingface.download_hf.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): download_hf(cfg) # Verify files were downloaded @@ -123,7 +123,7 @@ def test_download_hf_appends_sha_when_configured(mock_hf_fs, tmp_path): append_sha_to_path=True, ) - with patch("marin.download.huggingface.download_hf.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): download_hf(cfg) target_output = base_output_path / revision @@ -189,7 +189,7 @@ def create_buffer(): mock_fs.glob = Mock(return_value=["hf://datasets/test-org/test-dataset@main/data/file.parquet"]) mock_fs.open = Mock(side_effect=lambda path, mode="rb": create_buffer()) - with patch("marin.download.huggingface.stream_remove_columns.hf_fs", mock_fs): + with patch("marin.datakit.download.stream_remove_columns.hf_fs", mock_fs): prune_hf_dataset(cfg) # Verify output @@ -229,8 +229,8 @@ def read(self, chunk_size): hf_fs.open.side_effect = lambda path, mode="rb", **_kwargs: FlakyReader() with ( - patch("marin.download.huggingface.download_hf.HfFileSystem", return_value=hf_fs), - patch("marin.download.huggingface.download_hf.time.sleep", return_value=None), + patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs), + patch("marin.datakit.download.huggingface.time.sleep", return_value=None), ): result = stream_file_to_fsspec( str(output_path), diff --git a/tests/download/test_nemotron_cc.py b/tests/download/test_nemotron_cc.py index 6f3bdff56c..04ac04e2af 100644 --- a/tests/download/test_nemotron_cc.py +++ b/tests/download/test_nemotron_cc.py @@ -11,8 +11,8 @@ from iris.marin_fs import open_url as _real_open_url from 
marin.download.nemotron_cc.download_nemotron_cc import NemotronIngressConfig, download_nemotron_cc -_OPEN_URL_TARGET = "marin.download.nemotron_cc.download_nemotron_cc.open_url" -_REQUESTS_SESSION_TARGET = "marin.download.nemotron_cc.download_nemotron_cc.requests.Session" +_OPEN_URL_TARGET = "marin.datakit.download.nemotron_cc.open_url" +_REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_cc.requests.Session" SAMPLE_NEMOTRON_RECORDS = [ { From 3aa8bcc16af8b0e530495d101a19b837b8aaf079 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 11:05:04 -0700 Subject: [PATCH 09/56] Migrate all imports from marin.download.* to marin.datakit.download.* Updates 23 files across experiments/, tests/, and lib/ to import from the canonical marin.datakit.download.* paths. Removes the stale datakit/download.py file left over from the package conversion. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../common_pile/tokenize_common_pile.py | 2 +- experiments/defaults.py | 2 +- experiments/eval_datasets.py | 2 +- .../evals/exp1600_uncheatable_evals.py | 2 +- experiments/midtraining_datasets.py | 2 +- experiments/models.py | 2 +- .../download_and_tokenize_fineweb2_hq.py | 2 +- experiments/paloma.py | 2 +- experiments/posttrain/preference_datasets.py | 2 +- experiments/pretraining_datasets/dolma.py | 2 +- experiments/pretraining_datasets/dolmino.py | 2 +- experiments/pretraining_datasets/nemotron.py | 2 +- .../pretraining_datasets/nemotron_v2.py | 2 +- experiments/tootsie/exp1063_upload_tootsie.py | 2 +- .../eval_datasets_overlap.py | 2 +- experiments/two_stage/data.py | 2 +- lib/marin/src/marin/datakit/download.py | 62 ------------------- .../tokenize/download_pretokenized.py | 2 +- .../marin/speedrun/paloma_local_download.py | 4 +- .../transform/dolmino/transform_dclm_hq.py | 4 +- tests/download/test_ar5iv.py | 2 +- tests/download/test_dclm_hq.py | 2 +- tests/download/test_nemotron_cc.py | 2 +- tests/test_hfdataset_spec.py | 2 +- 24 files changed, 25 insertions(+), 87 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download.py diff --git a/experiments/common_pile/tokenize_common_pile.py b/experiments/common_pile/tokenize_common_pile.py index cf90e364ee..faee07fc76 100644 --- a/experiments/common_pile/tokenize_common_pile.py +++ b/experiments/common_pile/tokenize_common_pile.py @@ -5,7 +5,7 @@ from experiments.defaults import default_tokenize from experiments.llama import llama3_tokenizer -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path from marin.processing.tokenize.data_configs import TokenizerStep, lm_mixture_data_config diff --git a/experiments/defaults.py b/experiments/defaults.py index 2636c945b5..01e9583442 100644 --- a/experiments/defaults.py +++ b/experiments/defaults.py @@ -46,7 +46,7 @@ from experiments.simple_sft_config import SimpleSFTConfig from experiments.simple_train_config import SimpleTrainConfig from levanter.utils.mesh import MeshConfig -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.evaluation.evaluation_config import EvalTaskConfig from marin.execution.executor import ( ExecutorStep, diff --git a/experiments/eval_datasets.py b/experiments/eval_datasets.py index 1a79a4a994..f55df8b3fc 100644 --- a/experiments/eval_datasets.py +++ 
b/experiments/eval_datasets.py @@ -3,7 +3,7 @@ import dataclasses -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/evals/exp1600_uncheatable_evals.py b/experiments/evals/exp1600_uncheatable_evals.py index 50f57d63df..e2787a3351 100644 --- a/experiments/evals/exp1600_uncheatable_evals.py +++ b/experiments/evals/exp1600_uncheatable_evals.py @@ -22,7 +22,7 @@ from experiments.models import ModelConfig as HFModelConfig, download_model_step from fray.cluster import ResourceConfig from levanter.compat.hf_checkpoints import HFCheckpointConverter -from marin.download.uncheatable_eval.download import make_uncheatable_eval_step +from marin.datakit.download.uncheatable_eval import make_uncheatable_eval_step from marin.evaluation.log_probs import default_lm_log_probs from marin.execution.executor import ExecutorStep, executor_main, output_path_of from marin.processing.tokenize import TokenizeConfig diff --git a/experiments/midtraining_datasets.py b/experiments/midtraining_datasets.py index f96217880d..2706f8a4e9 100644 --- a/experiments/midtraining_datasets.py +++ b/experiments/midtraining_datasets.py @@ -4,7 +4,7 @@ from experiments.common_pile.tokenize_common_pile import stackv2_edu_filtered from experiments.defaults import default_download, default_tokenize from experiments.llama import llama3_tokenizer -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution import versioned from marin.execution.executor import ExecutorStep, this_output_path from marin.processing.tokenize import lm_mixture_data_config diff --git a/experiments/models.py b/experiments/models.py index 9e2a2db79b..972ca4f753 100644 --- a/experiments/models.py +++ b/experiments/models.py @@ -18,7 +18,7 @@ from dataclasses import dataclass -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.utils import get_directory_friendly_name diff --git a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py index 6c93fba71a..a3fd2ae82a 100644 --- a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py +++ b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py @@ -13,7 +13,7 @@ from experiments.llama import llama3_tokenizer from experiments.multilingual_fineweb2_hq.constants import FINEWEB2_DATASETS -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/paloma.py b/experiments/paloma.py index 74bd98e25f..24c1a536df 100644 --- a/experiments/paloma.py +++ b/experiments/paloma.py @@ -9,7 +9,7 @@ import os.path 
-from marin.download.huggingface.download_hf import DownloadConfig as HfDownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig, download_hf # cyclic dependency # from experiments.llama import llama3_tokenizer diff --git a/experiments/posttrain/preference_datasets.py b/experiments/posttrain/preference_datasets.py index e93e94a61b..105722d2af 100644 --- a/experiments/posttrain/preference_datasets.py +++ b/experiments/posttrain/preference_datasets.py @@ -22,7 +22,7 @@ from collections.abc import Sequence from dataclasses import dataclass, field -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ( ExecutorStep, executor_main, diff --git a/experiments/pretraining_datasets/dolma.py b/experiments/pretraining_datasets/dolma.py index 5c176c01f7..02b62df0aa 100644 --- a/experiments/pretraining_datasets/dolma.py +++ b/experiments/pretraining_datasets/dolma.py @@ -10,7 +10,7 @@ import os.path -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned, InputName from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/dolmino.py b/experiments/pretraining_datasets/dolmino.py index 414e0e28dc..25dab84f52 100644 --- a/experiments/pretraining_datasets/dolmino.py +++ b/experiments/pretraining_datasets/dolmino.py @@ -5,7 +5,7 @@ import os.path -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index 52c9d17d69..ac981b9720 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -8,7 +8,7 @@ from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE from experiments.pretraining_datasets.dclm import dclm_components_llama3 -from marin.download.nemotron_cc.download_nemotron_cc import NemotronIngressConfig, download_nemotron_cc +from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/nemotron_v2.py b/experiments/pretraining_datasets/nemotron_v2.py index 66d618ad53..ccb79f9e14 100644 --- a/experiments/pretraining_datasets/nemotron_v2.py +++ b/experiments/pretraining_datasets/nemotron_v2.py @@ -14,7 +14,7 @@ import os.path -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from 
marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/tootsie/exp1063_upload_tootsie.py b/experiments/tootsie/exp1063_upload_tootsie.py index 55d06ec875..d12aa5e060 100644 --- a/experiments/tootsie/exp1063_upload_tootsie.py +++ b/experiments/tootsie/exp1063_upload_tootsie.py @@ -25,7 +25,7 @@ from dataclasses import dataclass, field -from marin.download.huggingface.upload_gcs_to_hf import UploadConfig, upload_gcs_to_hf +from marin.datakit.download.upload_gcs_to_hf import UploadConfig, upload_gcs_to_hf from marin.execution.executor import ExecutorStep, executor_main diff --git a/experiments/train_test_overlap/eval_datasets_overlap.py b/experiments/train_test_overlap/eval_datasets_overlap.py index c6e7469221..b7df8679aa 100644 --- a/experiments/train_test_overlap/eval_datasets_overlap.py +++ b/experiments/train_test_overlap/eval_datasets_overlap.py @@ -1,7 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/two_stage/data.py b/experiments/two_stage/data.py index 9aeca84456..c78daf0ab1 100644 --- a/experiments/two_stage/data.py +++ b/experiments/two_stage/data.py @@ -6,7 +6,7 @@ from experiments.midtraining_datasets import finemath_3_plus_tokenized from experiments.pretraining_datasets import tokenize_dolma from experiments.pretraining_datasets.simple import tokenized -from marin.download.huggingface.download_hf import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path dolma_components = tokenize_dolma() diff --git a/lib/marin/src/marin/datakit/download.py b/lib/marin/src/marin/datakit/download.py deleted file mode 100644 index 0724472143..0000000000 --- a/lib/marin/src/marin/datakit/download.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Datakit download stage — fetch a HuggingFace dataset to persistent storage.""" - -from marin.download.huggingface.download_hf import DownloadConfig, download_hf -from marin.execution.step_spec import StepSpec - - -def download_step( - name: str, - *, - hf_dataset_id: str, - revision: str, - hf_urls_glob: list[str] | None = None, - zephyr_max_parallelism: int = 8, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that downloads a HuggingFace dataset. - - The raw download is preserved as-is in its original format and directory structure. - - Args: - name: Step name (e.g. "fineweb/download"). - hf_dataset_id: HuggingFace dataset identifier (e.g. "HuggingFaceFW/fineweb"). - revision: Commit hash from the HF dataset repo. - hf_urls_glob: Glob patterns to select specific files. Empty means all files. - zephyr_max_parallelism: Maximum download parallelism. - deps: Optional upstream dependencies. - output_path_prefix: Override the default output path prefix. - override_output_path: Override the computed output path entirely. - - Returns: - A StepSpec whose output_path contains the raw downloaded files. 
- """ - resolved_glob = hf_urls_glob or [] - - def _run(output_path: str) -> None: - download_hf( - DownloadConfig( - hf_dataset_id=hf_dataset_id, - revision=revision, - hf_urls_glob=resolved_glob, - gcs_output_path=output_path, - zephyr_max_parallelism=zephyr_max_parallelism, - ) - ) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={ - "hf_dataset_id": hf_dataset_id, - "revision": revision, - "hf_urls_glob": resolved_glob, - }, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) diff --git a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py index 91fdaca495..cab2433bec 100644 --- a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py +++ b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py @@ -18,7 +18,7 @@ ) from levanter.store.cache import CacheOptions -from marin.download.huggingface.download_hf import ( +from marin.datakit.download.huggingface import ( DownloadConfig as HfDownloadConfig, download_hf as hf_download_logic, ) diff --git a/lib/marin/src/marin/speedrun/paloma_local_download.py b/lib/marin/src/marin/speedrun/paloma_local_download.py index c7335a52c5..e2ee68f766 100644 --- a/lib/marin/src/marin/speedrun/paloma_local_download.py +++ b/lib/marin/src/marin/speedrun/paloma_local_download.py @@ -8,8 +8,8 @@ """ from experiments.paloma import paloma_tokenized -from marin.download import HfDownloadConfig -from marin.download.huggingface.download_hf import download_hf +from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig +from marin.datakit.download.huggingface import download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned llama3_tokenizer = "meta-llama/Meta-Llama-3.1-8B" diff --git a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py b/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py index 773cb3242a..dfaf263121 100644 --- a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py +++ b/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py @@ -40,8 +40,8 @@ import draccus from iris.marin_fs import open_url, url_to_fs -from marin.download.dclm_hq.download_dclm_hq_html import find_html_in_cc -from marin.download.huggingface.stream_remove_columns import hf_fs +from marin.datakit.download.dclm_hq import find_html_in_cc +from marin.datakit.download.stream_remove_columns import hf_fs from marin.schemas.web.convert import ExtractionConfig from marin.web.convert import convert_page from tqdm import tqdm diff --git a/tests/download/test_ar5iv.py b/tests/download/test_ar5iv.py index 442d557637..570fb706e3 100644 --- a/tests/download/test_ar5iv.py +++ b/tests/download/test_ar5iv.py @@ -7,7 +7,7 @@ import pytest -from marin.download.ar5iv.download import DownloadConfig, download +from marin.datakit.download.ar5iv import Ar5ivDownloadConfig as DownloadConfig, download @pytest.fixture diff --git a/tests/download/test_dclm_hq.py b/tests/download/test_dclm_hq.py index 21ec33c5b7..4ca4f48aef 100644 --- a/tests/download/test_dclm_hq.py +++ b/tests/download/test_dclm_hq.py @@ -7,7 +7,7 @@ from unittest.mock import patch import zstandard as zstd -from marin.download.dclm_hq.download_dclm_hq_html import DCLMHQDownloadConfig, extract_dclm_hq_dump +from marin.datakit.download.dclm_hq import DCLMHQDownloadConfig, extract_dclm_hq_dump SAMPLE_DCLM_RECORDS = [ { diff --git a/tests/download/test_nemotron_cc.py 
b/tests/download/test_nemotron_cc.py index 04ac04e2af..4657d008eb 100644 --- a/tests/download/test_nemotron_cc.py +++ b/tests/download/test_nemotron_cc.py @@ -9,7 +9,7 @@ import pytest import zstandard as zstd from iris.marin_fs import open_url as _real_open_url -from marin.download.nemotron_cc.download_nemotron_cc import NemotronIngressConfig, download_nemotron_cc +from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc _OPEN_URL_TARGET = "marin.datakit.download.nemotron_cc.open_url" _REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_cc.requests.Session" diff --git a/tests/test_hfdataset_spec.py b/tests/test_hfdataset_spec.py index 7bdd0d535c..14ad782471 100644 --- a/tests/test_hfdataset_spec.py +++ b/tests/test_hfdataset_spec.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from experiments.defaults import default_download, default_tokenize -from marin.download.huggingface.download_hf import DownloadConfig +from marin.datakit.download.huggingface import DownloadConfig from marin.processing.tokenize import HfDatasetSpec from marin.processing.tokenize.tokenize import HfTokenizeConfig, TokenizeConfig From ef02bd877aa0f444377e963e2ea70004a3b7d7c5 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 11:08:42 -0700 Subject: [PATCH 10/56] Migrate imports to canonical paths and simplify download functions Updates all 23 consumer files to import from marin.datakit.download.* instead of marin.download.*. Refactors download functions (transfer_files, download_nemotron_cc, extract_dclm_hq_dump) to accept plain parameters instead of requiring config dataclass construction. Config classes are kept for backward compat with ExecutorStep callers. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/dclm_hq.py | 35 ++++++----- .../src/marin/datakit/download/filesystem.py | 63 ++++++++----------- .../src/marin/datakit/download/nemotron_cc.py | 23 +++++-- 3 files changed, 64 insertions(+), 57 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/dclm_hq.py b/lib/marin/src/marin/datakit/download/dclm_hq.py index 83c127c079..ab00c2037d 100644 --- a/lib/marin/src/marin/datakit/download/dclm_hq.py +++ b/lib/marin/src/marin/datakit/download/dclm_hq.py @@ -172,35 +172,40 @@ def process_file(task: FileTask) -> None: raise -def extract_dclm_hq_dump(cfg: DCLMHQDownloadConfig) -> None: - """Process the DCLM HQ dump in the input path and save the results to the output path. +def extract_dclm_hq_dump(input_path_or_cfg: str | DCLMHQDownloadConfig, output_path: str | None = None) -> None: + """Process the DCLM HQ dump and enrich with HTML from Common Crawl. - Flattens the nested directory structure (shards → files) into a single list of files - and processes them in parallel using zephyr. + Args: + input_path_or_cfg: Input directory path, or a DCLMHQDownloadConfig for backward compat. + output_path: Output directory path. Required when input_path_or_cfg is a string. 
""" - logger.info(f"Starting processing of DCLM HQ dump in {cfg.input_path}") + if isinstance(input_path_or_cfg, DCLMHQDownloadConfig): + input_path = input_path_or_cfg.input_path + output_path = input_path_or_cfg.output_path + else: + input_path = input_path_or_cfg + if output_path is None: + raise ValueError("output_path is required when input_path_or_cfg is a string") + + logger.info(f"Starting processing of DCLM HQ dump in {input_path}") - # Flatten nested structure: discover all files upfront all_files = [] - paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(cfg.input_path, "*"))] + paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(input_path, "*"))] logger.info(f"Found {len(paths)} shards to process") for path in paths: - input_path = os.path.join(cfg.input_path, path) - shard_paths = fsspec_glob(os.path.join(input_path, "*.json.zst")) + shard_input = os.path.join(input_path, path) + shard_paths = fsspec_glob(os.path.join(shard_input, "*.json.zst")) for shard_path in shard_paths: - input_file_path = shard_path - output_file_path = os.path.join(cfg.output_path, path, os.path.basename(shard_path)).replace( + output_file_path = os.path.join(output_path, path, os.path.basename(shard_path)).replace( ".json.zst", ".jsonl.gz" ) - - all_files.append(FileTask(input_file_path=input_file_path, output_file_path=output_file_path)) + all_files.append(FileTask(input_file_path=shard_path, output_file_path=output_file_path)) logger.info(f"Found {len(all_files)} files to process") - # Single-level parallelism over all files pipeline = Dataset.from_list(all_files).map(process_file) ctx = ZephyrContext(name="download-dclm-html") @@ -220,7 +225,7 @@ def dclm_hq_step( """Create a StepSpec that downloads DCLM HQ HTML data from Common Crawl.""" def _run(output_path: str) -> None: - extract_dclm_hq_dump(DCLMHQDownloadConfig(input_path=input_path, output_path=output_path)) + extract_dclm_hq_dump(input_path, output_path) return StepSpec( name=name, diff --git a/lib/marin/src/marin/datakit/download/filesystem.py b/lib/marin/src/marin/datakit/download/filesystem.py index 287426666f..0177d644ef 100644 --- a/lib/marin/src/marin/datakit/download/filesystem.py +++ b/lib/marin/src/marin/datakit/download/filesystem.py @@ -1,6 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +import logging import os import random import time @@ -12,61 +13,58 @@ from marin.utils import fsspec_exists, fsspec_glob +logger = logging.getLogger(__name__) + @dataclass class TransferConfig: + """Kept for backward compatibility. Prefer ``transfer_files()`` with flat params.""" + input_path: str output_path: str - - # Selectively choose the number of random files to transfer. None means all files num_random_files: int | None = None filetype: str = "jsonl.zst" -def transfer_files(config: TransferConfig) -> None: - """Transfers files from the input path to the output path. +def transfer_files( + input_path: str, + output_path: str, + *, + num_random_files: int | None = None, + filetype: str = "jsonl.zst", +) -> None: + """Transfer files from input_path to output_path. - When num_random_files is None, copies the entire directory recursively. - When num_random_files is specified, randomly samples that many files and - copies them in parallel using zephyr. + When num_random_files is None, copies all matching files. + When specified, randomly samples that many files. 
""" - if config.input_path.endswith("/"): - input_path = config.input_path[:-1] - else: - input_path = config.input_path + input_path = input_path.rstrip("/") - print(f"Downloading {input_path} from GCS.") - start_time: float = time.time() + logger.info("Transferring %s to %s", input_path, output_path) + start_time = time.time() fs, _ = url_to_fs(input_path) if not fs.exists(input_path): raise FileNotFoundError(f"{input_path} does not exist.") - # Glob all matching files - filenames = fsspec_glob(os.path.join(input_path, f"**/*.{config.filetype}")) + filenames = fsspec_glob(os.path.join(input_path, f"**/*.{filetype}")) - # Select files: either random sample or all files - if config.num_random_files is None: - selected_files = filenames - else: + if num_random_files is not None: random.seed(42) random.shuffle(filenames) - selected_files = filenames[: config.num_random_files] + filenames = filenames[:num_random_files] def copy_file(filename: str) -> None: - """Copy a single file if it doesn't already exist at destination.""" - output_filename = os.path.join(config.output_path, os.path.basename(filename)) + output_filename = os.path.join(output_path, os.path.basename(filename)) if not fsspec_exists(output_filename): - # Ensure output directory exists - fs.makedirs(config.output_path, exist_ok=True) + fs.makedirs(output_path, exist_ok=True) fs.copy(filename, output_filename) - # Always use parallel copying via zephyr - pipeline = Dataset.from_list(selected_files).map(copy_file) + pipeline = Dataset.from_list(filenames).map(copy_file) ctx = ZephyrContext(name="fs-transfer") ctx.execute(pipeline) - elapsed_time_seconds: float = time.time() - start_time - print(f"Downloaded {input_path} to {config.output_path} ({elapsed_time_seconds}s).") + elapsed = time.time() - start_time + logger.info("Transferred %s to %s (%.1fs)", input_path, output_path, elapsed) def transfer_step( @@ -82,14 +80,7 @@ def transfer_step( """Create a StepSpec that transfers files between fsspec paths.""" def _run(output_path: str) -> None: - transfer_files( - TransferConfig( - input_path=input_path, - output_path=output_path, - num_random_files=num_random_files, - filetype=filetype, - ) - ) + transfer_files(input_path, output_path, num_random_files=num_random_files, filetype=filetype) return StepSpec( name=name, diff --git a/lib/marin/src/marin/datakit/download/nemotron_cc.py b/lib/marin/src/marin/datakit/download/nemotron_cc.py index 4b32983091..ba06ba00f2 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_cc.py +++ b/lib/marin/src/marin/datakit/download/nemotron_cc.py @@ -87,11 +87,22 @@ def download_single_nemotron_path(input_file_path: str, output_file_path: str) - @dataclass class NemotronIngressConfig: + """Kept for backward compatibility with ExecutorStep callers.""" + output_path: str = THIS_OUTPUT_PATH -def download_nemotron_cc(cfg: NemotronIngressConfig): - paths_file_path = os.path.join(cfg.output_path, "data-jsonl.paths") +def download_nemotron_cc(output_path_or_cfg: str | NemotronIngressConfig) -> None: + """Download and process Nemotron-CC dataset from Common Crawl. + + Args: + output_path_or_cfg: Output directory path, or a NemotronIngressConfig for backward compat. 
+ """ + output_path = ( + output_path_or_cfg.output_path if isinstance(output_path_or_cfg, NemotronIngressConfig) else output_path_or_cfg + ) + + paths_file_path = os.path.join(output_path, "data-jsonl.paths") logger.info(f"Downloading Nemotron CC path file {paths_file_path}") with open_url(NCC_PATH_FILE_URL, "rb") as f, open_url(paths_file_path, "wb") as f_out: @@ -102,7 +113,7 @@ def download_nemotron_cc(cfg: NemotronIngressConfig): with open_url(paths_file_path, "r", compression="gzip") as f: for line in f: file = line.strip() - output_file_path = os.path.join(cfg.output_path, file).replace("jsonl.zstd", "jsonl.zst") + output_file_path = os.path.join(output_path, file).replace("jsonl.zstd", "jsonl.zst") all_files.append((file, output_file_path)) logger.info(f"Processing {len(all_files)} Nemotron CC files") @@ -111,13 +122,13 @@ def download_nemotron_cc(cfg: NemotronIngressConfig): Dataset.from_list(all_files) .filter(lambda file_info: not fsspec_exists(file_info[1])) .map(lambda file_info: download_single_nemotron_path(*file_info)) - .write_jsonl(os.path.join(cfg.output_path, ".metrics/download-{shard:05d}.jsonl"), skip_existing=True) + .write_jsonl(os.path.join(output_path, ".metrics/download-{shard:05d}.jsonl"), skip_existing=True) ) ctx = ZephyrContext(name="download-nemotron-cc") ctx.execute(pipeline) - logger.info(f"Downloaded Nemotron CC files to {cfg.output_path}") + logger.info(f"Downloaded Nemotron CC files to {output_path}") def nemotron_cc_step( @@ -130,7 +141,7 @@ def nemotron_cc_step( """Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl.""" def _run(output_path: str) -> None: - download_nemotron_cc(NemotronIngressConfig(output_path=output_path)) + download_nemotron_cc(output_path) return StepSpec( name=name, From 4c8f38f3a52688589446478c45e272a81bd2c14a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 12:09:47 -0700 Subject: [PATCH 11/56] Remove unused config dataclasses from download functions Removes NemotronIngressConfig, DCLMHQDownloadConfig, and TransferConfig. The underlying functions (download_nemotron_cc, extract_dclm_hq_dump, transfer_files) now take plain parameters directly. Updates tests and nemotron.py experiment to use the flat-param API or *_step() functions. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/nemotron.py | 10 ++------ .../src/marin/datakit/download/dclm_hq.py | 23 ++----------------- .../src/marin/datakit/download/filesystem.py | 11 --------- .../src/marin/datakit/download/nemotron_cc.py | 20 ++-------------- .../download/dclm_hq/download_dclm_hq_html.py | 1 - .../src/marin/download/filesystem/transfer.py | 1 - .../nemotron_cc/download_nemotron_cc.py | 1 - tests/download/test_dclm_hq.py | 5 ++-- tests/download/test_nemotron_cc.py | 11 ++++----- 9 files changed, 12 insertions(+), 71 deletions(-) diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index ac981b9720..4c463d8e4f 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -8,20 +8,14 @@ from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE from experiments.pretraining_datasets.dclm import dclm_components_llama3 -from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc +from marin.datakit.download.nemotron_cc import nemotron_cc_step from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep # Raw dataset download step downloads = { - "nemotron_cc": ExecutorStep( - name="raw/nemotro-cc", - fn=download_nemotron_cc, - config=NemotronIngressConfig( - output_path=this_output_path(), - ), - ) + "nemotron_cc": nemotron_cc_step("raw/nemotro-cc").as_executor_step(), } _nemotron_cc_path = output_path_of(downloads["nemotron_cc"], "contrib/Nemotron/Nemotron-CC/data-jsonl/") diff --git a/lib/marin/src/marin/datakit/download/dclm_hq.py b/lib/marin/src/marin/datakit/download/dclm_hq.py index ab00c2037d..a4301245aa 100644 --- a/lib/marin/src/marin/datakit/download/dclm_hq.py +++ b/lib/marin/src/marin/datakit/download/dclm_hq.py @@ -34,12 +34,6 @@ logger = logging.getLogger(__name__) -@dataclass -class DCLMHQDownloadConfig: - input_path: str - output_path: str - - @dataclass class FileTask: """Represents a single file processing task.""" @@ -172,21 +166,8 @@ def process_file(task: FileTask) -> None: raise -def extract_dclm_hq_dump(input_path_or_cfg: str | DCLMHQDownloadConfig, output_path: str | None = None) -> None: - """Process the DCLM HQ dump and enrich with HTML from Common Crawl. - - Args: - input_path_or_cfg: Input directory path, or a DCLMHQDownloadConfig for backward compat. - output_path: Output directory path. Required when input_path_or_cfg is a string. 
- """ - if isinstance(input_path_or_cfg, DCLMHQDownloadConfig): - input_path = input_path_or_cfg.input_path - output_path = input_path_or_cfg.output_path - else: - input_path = input_path_or_cfg - if output_path is None: - raise ValueError("output_path is required when input_path_or_cfg is a string") - +def extract_dclm_hq_dump(input_path: str, output_path: str) -> None: + """Process the DCLM HQ dump and enrich with HTML from Common Crawl.""" logger.info(f"Starting processing of DCLM HQ dump in {input_path}") all_files = [] diff --git a/lib/marin/src/marin/datakit/download/filesystem.py b/lib/marin/src/marin/datakit/download/filesystem.py index 0177d644ef..7ace48ab38 100644 --- a/lib/marin/src/marin/datakit/download/filesystem.py +++ b/lib/marin/src/marin/datakit/download/filesystem.py @@ -5,7 +5,6 @@ import os import random import time -from dataclasses import dataclass from iris.marin_fs import url_to_fs from marin.execution.step_spec import StepSpec @@ -16,16 +15,6 @@ logger = logging.getLogger(__name__) -@dataclass -class TransferConfig: - """Kept for backward compatibility. Prefer ``transfer_files()`` with flat params.""" - - input_path: str - output_path: str - num_random_files: int | None = None - filetype: str = "jsonl.zst" - - def transfer_files( input_path: str, output_path: str, diff --git a/lib/marin/src/marin/datakit/download/nemotron_cc.py b/lib/marin/src/marin/datakit/download/nemotron_cc.py index ba06ba00f2..0e65f307b9 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_cc.py +++ b/lib/marin/src/marin/datakit/download/nemotron_cc.py @@ -14,12 +14,10 @@ import logging import os from collections.abc import Iterator -from dataclasses import dataclass import requests import zstandard from iris.marin_fs import open_url -from marin.execution import THIS_OUTPUT_PATH from marin.execution.step_spec import StepSpec from marin.utils import fsspec_exists from requests.adapters import HTTPAdapter @@ -85,22 +83,8 @@ def download_single_nemotron_path(input_file_path: str, output_file_path: str) - return {"input_file": input_file_path, "output_file": output_file_path, "num_records": num_records} -@dataclass -class NemotronIngressConfig: - """Kept for backward compatibility with ExecutorStep callers.""" - - output_path: str = THIS_OUTPUT_PATH - - -def download_nemotron_cc(output_path_or_cfg: str | NemotronIngressConfig) -> None: - """Download and process Nemotron-CC dataset from Common Crawl. - - Args: - output_path_or_cfg: Output directory path, or a NemotronIngressConfig for backward compat. - """ - output_path = ( - output_path_or_cfg.output_path if isinstance(output_path_or_cfg, NemotronIngressConfig) else output_path_or_cfg - ) +def download_nemotron_cc(output_path: str) -> None: + """Download and process Nemotron-CC dataset from Common Crawl.""" paths_file_path = os.path.join(output_path, "data-jsonl.paths") logger.info(f"Downloading Nemotron CC path file {paths_file_path}") diff --git a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py b/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py index a49caab9d7..a98513e7df 100644 --- a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py +++ b/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 # Backward-compat shim. 
Canonical location: marin.datakit.download.dclm_hq -from marin.datakit.download.dclm_hq import DCLMHQDownloadConfig as DCLMHQDownloadConfig from marin.datakit.download.dclm_hq import FileTask as FileTask from marin.datakit.download.dclm_hq import extract_dclm_hq_dump as extract_dclm_hq_dump from marin.datakit.download.dclm_hq import fetch_warc_from_cc as fetch_warc_from_cc diff --git a/lib/marin/src/marin/download/filesystem/transfer.py b/lib/marin/src/marin/download/filesystem/transfer.py index 5456bf8cc5..045a360623 100644 --- a/lib/marin/src/marin/download/filesystem/transfer.py +++ b/lib/marin/src/marin/download/filesystem/transfer.py @@ -2,5 +2,4 @@ # SPDX-License-Identifier: Apache-2.0 # Backward-compat shim. Canonical location: marin.datakit.download.filesystem -from marin.datakit.download.filesystem import TransferConfig as TransferConfig from marin.datakit.download.filesystem import transfer_files as transfer_files diff --git a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py b/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py index 81251cb66c..c7e8e16e54 100644 --- a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py +++ b/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py @@ -2,6 +2,5 @@ # SPDX-License-Identifier: Apache-2.0 # Backward-compat shim. Canonical location: marin.datakit.download.nemotron_cc -from marin.datakit.download.nemotron_cc import NemotronIngressConfig as NemotronIngressConfig from marin.datakit.download.nemotron_cc import download_nemotron_cc as download_nemotron_cc from marin.datakit.download.nemotron_cc import download_single_nemotron_path as download_single_nemotron_path diff --git a/tests/download/test_dclm_hq.py b/tests/download/test_dclm_hq.py index 4ca4f48aef..c83b5e03fe 100644 --- a/tests/download/test_dclm_hq.py +++ b/tests/download/test_dclm_hq.py @@ -7,7 +7,7 @@ from unittest.mock import patch import zstandard as zstd -from marin.datakit.download.dclm_hq import DCLMHQDownloadConfig, extract_dclm_hq_dump +from marin.datakit.download.dclm_hq import extract_dclm_hq_dump SAMPLE_DCLM_RECORDS = [ { @@ -171,8 +171,7 @@ def mock_requests_get(url, **kwargs): raise ValueError(f"Unexpected URL: {url}") with patch("marin.datakit.download.dclm_hq.requests.get", side_effect=mock_requests_get): - cfg = DCLMHQDownloadConfig(input_path=str(tmp_path / "input"), output_path=str(output_dir)) - extract_dclm_hq_dump(cfg) + extract_dclm_hq_dump(str(tmp_path / "input"), str(output_dir)) # Verify output files were created in nested structure shard1_output = output_dir / "shard1" diff --git a/tests/download/test_nemotron_cc.py b/tests/download/test_nemotron_cc.py index 4657d008eb..e4e89e361a 100644 --- a/tests/download/test_nemotron_cc.py +++ b/tests/download/test_nemotron_cc.py @@ -9,7 +9,7 @@ import pytest import zstandard as zstd from iris.marin_fs import open_url as _real_open_url -from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc +from marin.datakit.download.nemotron_cc import download_nemotron_cc _OPEN_URL_TARGET = "marin.datakit.download.nemotron_cc.open_url" _REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_cc.requests.Session" @@ -114,8 +114,7 @@ def test_download_nemotron_cc_pipeline(tmp_path, mock_paths_open): patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)), patch(_REQUESTS_SESSION_TARGET, _mock_session_for({"file1": file1_data, "file2": file2_data})), ): - cfg = NemotronIngressConfig(output_path=str(output_dir)) - download_nemotron_cc(cfg) + 
download_nemotron_cc(str(output_dir)) all_records = read_all_jsonl_zst(output_dir / "contrib" / "Nemotron") @@ -152,8 +151,7 @@ def test_download_nemotron_cc_dolma_format(tmp_path, mock_paths_open): patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)), patch(_REQUESTS_SESSION_TARGET, _mock_session_for({"test": compressed_data})), ): - cfg = NemotronIngressConfig(output_path=str(output_dir)) - download_nemotron_cc(cfg) + download_nemotron_cc(str(output_dir)) records = read_all_jsonl_zst(output_dir / "contrib" / "Nemotron") assert len(records) == 1 @@ -188,8 +186,7 @@ def test_download_nemotron_cc_skips_existing(tmp_path, mock_paths_open): patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)), patch(_REQUESTS_SESSION_TARGET) as mock_session, ): - cfg = NemotronIngressConfig(output_path=str(output_dir)) - download_nemotron_cc(cfg) + download_nemotron_cc(str(output_dir)) mock_session.return_value.get.assert_not_called() assert existing_output.read_text() == "existing" From 16f5c20abcc9d705220a3b3b9f1f393105a86773 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 12:13:23 -0700 Subject: [PATCH 12/56] Update datakit design doc: use Parquet instead of Vortex Switches the standard format from Vortex to Parquet throughout the design doc. Notes vortex#6905 as the blocking issue that motivated the change. Parquet provides the same columnar benefits with a mature ecosystem. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/2355_datakit.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/docs/design/2355_datakit.md b/docs/design/2355_datakit.md index 7ef15bf46e..0cb3ac8327 100644 --- a/docs/design/2355_datakit.md +++ b/docs/design/2355_datakit.md @@ -23,7 +23,7 @@ Download raw dataset from Hugging Face (or other sources). Raw downloads are pre Convert raw data into the **datakit standard format**: -* **File format**: Vortex \- columnar, supports pushdown filters and column projection, efficient lookup. +* **File format**: Parquet \- columnar, widely supported, supports pushdown filters and column projection. * **Mandatory columns**: * `id` \- unique document identifier (see [ID Column](#id-column) below) * `text` \- primary text content \- we enforce UTF-8 @@ -35,7 +35,7 @@ Convert raw data into the **datakit standard format**: * **Sort invariant**: each partition is sorted by `id` * **Typed output:** in the code the data has typed representation via `Artifact` -This is the "intake" step \- all downstream stages operate on normalized Vortex datasets. +This is the "intake" step \- all downstream stages operate on normalized Parquet datasets. ## 3\. Embed @@ -56,7 +56,7 @@ Join attributes datasets back to the source documents and apply filters: * Filter by classifier thresholds (e.g., quality score \> 0.8) * Remove duplicate spans/documents -Output is a clean, filtered Vortex dataset \- still sorted by `id`, still co-partitioned. +Output is a clean, filtered Parquet dataset \- still sorted by `id`, still co-partitioned. ## 8\. Tokenize @@ -66,15 +66,16 @@ Convert clean text into tokenized Levanter cache format. # Core Design Decisions -## Vortex as the Standard Format +## Parquet as the Standard Format -All intermediate datasets (from normalization through consolidation) use the Vortex columnar format. Benefits: +All intermediate datasets (from normalization through consolidation) use the Parquet columnar format. 
Benefits: * Column projection (only read the columns you need) * Filter pushdown * Efficient sorted merge joins via Zephyr +* Mature ecosystem with broad tooling support -NOTE: Vortex is much less mature than Parquet. This is a major concern. We will start with Vortex and if we hit roadblocks, revert to Parquet. +NOTE: We initially considered Vortex for its pushdown and lookup capabilities, but encountered blocking issues with Zephyr pipeline integration (see [vortex\#6905](https://github.com/vortex-data/vortex/issues/6905)). Parquet provides the same columnar benefits with a proven ecosystem. If Vortex matures, we can revisit. ## ID Column {#id-column} @@ -96,14 +97,14 @@ This is enforced by convention: each processing stage reads source partitions 1: ## Attributes Datasets {#attributes-datasets} -Processing stages (embed, classify, dedup) produce **attributes datasets** \- lightweight Vortex files containing: +Processing stages (embed, classify, dedup) produce **attributes datasets** \- lightweight Parquet files containing: * `id` — matching the source document ID * Stage-specific output columns (e.g., `quality_score`, `is_duplicate`, `topic_label`) Attributes datasets: -* Use Vortex format +* Use Parquet format * Are co-partitioned with the source (same shard count and key ranges) * Are sorted by `id` within each partition * Can be joined back to source documents via `sorted_merge_join` @@ -133,7 +134,7 @@ download = StepSpec( normalize = StepSpec( name="fineweb/normalize", deps=[download], - fn=lambda output_path: normalize_to_vortex( + fn=lambda output_path: normalize_to_parquet( input_path=download.output_path, output_path=output_path, text_field="text", ), hash_attrs={"text_field": "text"}, @@ -188,7 +189,7 @@ Core primitives — the reusable building blocks: ``` lib/marin/datakit/ - normalize # Raw format -> standard Vortex (id, text, ...) + normalize # Raw format -> standard Parquet (id, text, ...) embed # Document embedding classify # Quality/topic classification dedup # Deduplication (exact + fuzzy) @@ -201,7 +202,7 @@ Dataset-specific wiring \- which transforms to apply for a given dataset, expres # Execution Plan -* Implement `datakit/normalize.py` \- standard schema definitions, ID generation, raw format to Vortex conversion with mandatory columns +* Implement `datakit/normalize.py` \- standard schema definitions, ID generation, raw format to Parquet conversion with mandatory columns * Integration tests for the normalize step * Integration tests covering download, normalize, dedup and tokenize at reasonable scale * Update Grug/ferry experiment definitions to consume datakit pipeline outputs directly From 5af3272fcd8e61c2dc3177f5d2fb21ebdce6c28b Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:27:16 -0700 Subject: [PATCH 13/56] Remove global HfFileSystem() instance from stream_remove_columns Replace the module-level hf_fs = HfFileSystem() with per-call construction to avoid side effects at import time. Update the one external consumer (transform_dclm_hq.py) and test mock target. 
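As a minimal sketch of the pattern (the helper name below is illustrative, not part of the module):

```python
from huggingface_hub import HfFileSystem
import pyarrow.parquet as pq

# Before: hf_fs = HfFileSystem() lived at module scope and was shared by all callers,
# so merely importing the module constructed a filesystem client.
# After: build the handle inside the call that needs it; importing stays side-effect free.
def open_hf_parquet(input_file: str) -> pq.ParquetFile:
    return pq.ParquetFile(HfFileSystem().open(input_file))
```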
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/stream_remove_columns.py | 5 ++--- .../src/marin/download/huggingface/stream_remove_columns.py | 1 - lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py | 3 ++- tests/download/test_huggingface.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/stream_remove_columns.py b/lib/marin/src/marin/datakit/download/stream_remove_columns.py index b16e3a1f1b..ba883ee944 100644 --- a/lib/marin/src/marin/datakit/download/stream_remove_columns.py +++ b/lib/marin/src/marin/datakit/download/stream_remove_columns.py @@ -13,7 +13,6 @@ from tqdm import tqdm from zephyr import Dataset, ZephyrContext -hf_fs = HfFileSystem() logger = logging.getLogger(__name__) @@ -29,7 +28,7 @@ def prune_stream_and_save(input_file: str, output_file: str, keep_columns: list[ output_file (str): Path where pruned parquet file will be saved keep_columns (list[str]): List of column names to retain """ - parquet_file = pq.ParquetFile(hf_fs.open(input_file)) + parquet_file = pq.ParquetFile(HfFileSystem().open(input_file)) full_df_list = [] for batch in tqdm(parquet_file.iter_batches(batch_size=10000), desc=f"Processing {input_file}"): @@ -58,7 +57,7 @@ def get_file_tasks(hf_path: str, output_path: str, keep_columns: list[str]): Dict with input_file, output_file, and keep_columns for each parquet file """ logger.info(f"Loading dataset from {hf_path}") - parquet_list = hf_fs.glob(f"{hf_path}/*.parquet") + parquet_list = HfFileSystem().glob(f"{hf_path}/*.parquet") for file in parquet_list: output_file = os.path.join(output_path, os.path.basename(file)) diff --git a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py b/lib/marin/src/marin/download/huggingface/stream_remove_columns.py index 6d5d39f492..68a44db40c 100644 --- a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py +++ b/lib/marin/src/marin/download/huggingface/stream_remove_columns.py @@ -4,6 +4,5 @@ from marin.datakit.download.stream_remove_columns import DatasetConfig as DatasetConfig from marin.datakit.download.stream_remove_columns import get_file_tasks as get_file_tasks -from marin.datakit.download.stream_remove_columns import hf_fs as hf_fs from marin.datakit.download.stream_remove_columns import prune_hf_dataset as prune_hf_dataset from marin.datakit.download.stream_remove_columns import prune_stream_and_save as prune_stream_and_save diff --git a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py b/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py index dfaf263121..42f04264bf 100644 --- a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py +++ b/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py @@ -41,7 +41,7 @@ import draccus from iris.marin_fs import open_url, url_to_fs from marin.datakit.download.dclm_hq import find_html_in_cc -from marin.datakit.download.stream_remove_columns import hf_fs +from huggingface_hub import HfFileSystem from marin.schemas.web.convert import ExtractionConfig from marin.web.convert import convert_page from tqdm import tqdm @@ -115,6 +115,7 @@ def process_dclm_hq_dump(cfg: DCLMHQExtractionConfig) -> None: # Glob all files across all shards upfront all_files = [] + hf_fs = HfFileSystem() paths = [i.split("/")[-1] for i in hf_fs.ls(cfg.input_hf_path, detail=False)] paths = paths[: cfg.max_split] if cfg.max_split else paths diff --git a/tests/download/test_huggingface.py b/tests/download/test_huggingface.py index 24a5bc6169..4d16eadf6b 100644 
--- a/tests/download/test_huggingface.py +++ b/tests/download/test_huggingface.py @@ -189,7 +189,7 @@ def create_buffer(): mock_fs.glob = Mock(return_value=["hf://datasets/test-org/test-dataset@main/data/file.parquet"]) mock_fs.open = Mock(side_effect=lambda path, mode="rb": create_buffer()) - with patch("marin.datakit.download.stream_remove_columns.hf_fs", mock_fs): + with patch("marin.datakit.download.stream_remove_columns.HfFileSystem", return_value=mock_fs): prune_hf_dataset(cfg) # Verify output From f6959ecc1d1c72a2a25116bf2cddffdaafcbfabe Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:28:49 -0700 Subject: [PATCH 14/56] Inline pretraining downloads into simple.py, delete pretraining.py Removes the single-call wrapper functions in pretraining.py and inlines download_hf_step calls directly in simple.py via a _dl() helper. This eliminates the indirection of one function per dataset. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/simple.py | 68 +++++----- .../src/marin/datakit/download/pretraining.py | 119 ------------------ 2 files changed, 37 insertions(+), 150 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download/pretraining.py diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py index 79910f3741..5fa9a5fa65 100644 --- a/experiments/pretraining_datasets/simple.py +++ b/experiments/pretraining_datasets/simple.py @@ -12,19 +12,7 @@ from levanter.data.text import TextLmDatasetFormat from levanter.store.cache import CacheOptions -from marin.datakit.download.pretraining import ( - dclm_baseline_download, - dclm_baseline_wrong_download, - dolma3_mix_150b_1025_download, - fineweb_download, - fineweb_edu_download, - proofpile_2_download, - slimpajama_6b_download, - slimpajama_download, - starcoderdata_download, - the_pile_openwebtext2_download, - the_stack_dedup_download, -) +from marin.datakit.download.huggingface import download_hf_step from marin.execution.executor import ExecutorStep, InputName, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize @@ -65,42 +53,60 @@ def _tokenize_simple( return step +def _dl(name: str, hf_dataset_id: str, revision: str, output_path: str) -> ExecutorStep: + """Create a download ExecutorStep from a StepSpec.""" + return download_hf_step( + name, hf_dataset_id=hf_dataset_id, revision=revision, override_output_path=output_path + ).as_executor_step() + + # ============================================================================ # RAW DATASET DOWNLOADS # ============================================================================ def _build_downloads() -> dict[str, ExecutorStep | InputName]: - """Build the downloads dict from canonical StepSpec definitions in pretraining.py.""" - fineweb_edu_base = fineweb_edu_download().as_executor_step() + fineweb_edu_base = _dl("raw/fineweb-edu", "HuggingFaceFW/fineweb-edu", "87f0914", "raw/fineweb-edu-87f0914") return { - "fineweb": fineweb_download().as_executor_step(), + "fineweb": _dl("raw/fineweb", "HuggingFaceFW/fineweb", "cd85054", "raw/fineweb"), "fineweb_edu": fineweb_edu_base.cd("data"), "fineweb_edu_sample_10bt": fineweb_edu_base.cd("sample/10BT"), "fineweb_edu_sample_100bt": fineweb_edu_base.cd("sample/100BT"), "fineweb_edu_sample_350bt": fineweb_edu_base.cd("sample/350BT"), "slimpajama": ( - slimpajama_download() - .as_executor_step() - .cd("2d0accd/huggingface.co/datasets/cerebras/SlimPajama-627B/resolve/2d0accd") + _dl("raw/SlimPajama-627B", 
"cerebras/SlimPajama-627B", "2d0accd", "raw/SlimPajama-627B-262830").cd( + "2d0accd/huggingface.co/datasets/cerebras/SlimPajama-627B/resolve/2d0accd" + ) + ), + "slimpajama_6b": ( + _dl("raw/SlimPajama-6B", "DKYoon/SlimPajama-6B", "b5f90f4", "raw/SlimPajama-6B-be35b7").cd("data") + ), + "dolma3_mix_150b_1025": ( + _dl( + "raw/dolma3_mix-150B-1025", "allenai/dolma3_mix-150B-1025", "15d04ee", "raw/dolma3_mix-150B-1025-15d04ee" + ).cd("15d04ee") + ), + "dclm_baseline_wrong": _dl( + "raw/dclm-baseline-1.0", "mlfoundations/dclm-baseline-1.0", "a3b142c", "raw/dclm_WRONG_20250211/" + ), + "dclm_baseline": ( + _dl("raw/dclm-baseline-1.0", "mlfoundations/dclm-baseline-1.0", "a3b142c", "raw/dclm").cd("a3b142c") + ), + "the_stack_dedup": ( + _dl("raw/the-stack-dedup", "bigcode/the-stack-dedup", "17cad72", "raw/the-stack-dedup-4ba450").cd("17cad72") ), - "slimpajama_6b": slimpajama_6b_download().as_executor_step().cd("data"), - "dolma3_mix_150b_1025": dolma3_mix_150b_1025_download().as_executor_step().cd("15d04ee"), - "dclm_baseline_wrong": dclm_baseline_wrong_download().as_executor_step(), - "dclm_baseline": dclm_baseline_download().as_executor_step().cd("a3b142c"), - "the_stack_dedup": the_stack_dedup_download().as_executor_step().cd("17cad72"), "proofpile_2": ( - proofpile_2_download() - .as_executor_step() - .cd("901a927/huggingface.co/datasets/EleutherAI/proof-pile-2/resolve/901a927") + _dl("raw/proof-pile-2", "EleutherAI/proof-pile-2", "901a927", "raw/proof-pile-2-f1b1d8").cd( + "901a927/huggingface.co/datasets/EleutherAI/proof-pile-2/resolve/901a927" + ) ), "the_pile_openwebtext2": ( - the_pile_openwebtext2_download() - .as_executor_step() - .cd("1de27c6/huggingface.co/datasets/vietgpt/the_pile_openwebtext2/resolve/1de27c6") + _dl("raw/the_pile_openwebtext2", "vietgpt/the_pile_openwebtext2", "1de27c6", "raw/the_pile_openwebtext2").cd( + "1de27c6/huggingface.co/datasets/vietgpt/the_pile_openwebtext2/resolve/1de27c6" + ) ), - "starcoderdata": starcoderdata_download().as_executor_step(), + "starcoderdata": _dl("raw/starcoderdata", "bigcode/starcoderdata", "9fc30b5", "raw/starcoderdata-720c8c"), } diff --git a/lib/marin/src/marin/datakit/download/pretraining.py b/lib/marin/src/marin/datakit/download/pretraining.py deleted file mode 100644 index 3300820ba3..0000000000 --- a/lib/marin/src/marin/datakit/download/pretraining.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Pre-defined download steps for common pretraining datasets. - -Each function returns a StepSpec for downloading a specific dataset from -HuggingFace. These are the canonical definitions — experiments should -import from here rather than defining download steps inline. - -For datasets where the actual data lives in a subdirectory of the download -(e.g. fineweb-edu has data under ``data/``), the function returns the -StepSpec for the base download. Consumers that need the subdirectory path -should use ``step.output_path + "/data"`` or convert to ExecutorStep and -use ``.cd("data")``. -""" - -from marin.datakit.download.huggingface import download_hf_step -from marin.execution.step_spec import StepSpec - - -def fineweb_download() -> StepSpec: - return download_hf_step( - "raw/fineweb", - hf_dataset_id="HuggingFaceFW/fineweb", - revision="cd85054", - override_output_path="raw/fineweb", - ) - - -def fineweb_edu_download() -> StepSpec: - """Base download for fineweb-edu. 
Data is under the ``data/`` subdirectory.""" - return download_hf_step( - "raw/fineweb-edu", - hf_dataset_id="HuggingFaceFW/fineweb-edu", - revision="87f0914", - override_output_path="raw/fineweb-edu-87f0914", - ) - - -def slimpajama_download() -> StepSpec: - return download_hf_step( - "raw/SlimPajama-627B", - hf_dataset_id="cerebras/SlimPajama-627B", - revision="2d0accd", - override_output_path="raw/SlimPajama-627B-262830", - ) - - -def slimpajama_6b_download() -> StepSpec: - return download_hf_step( - "raw/SlimPajama-6B", - hf_dataset_id="DKYoon/SlimPajama-6B", - revision="b5f90f4", - override_output_path="raw/SlimPajama-6B-be35b7", - ) - - -def dolma3_mix_150b_1025_download() -> StepSpec: - return download_hf_step( - "raw/dolma3_mix-150B-1025", - hf_dataset_id="allenai/dolma3_mix-150B-1025", - revision="15d04ee", - override_output_path="raw/dolma3_mix-150B-1025-15d04ee", - ) - - -def dclm_baseline_download() -> StepSpec: - return download_hf_step( - "raw/dclm-baseline-1.0", - hf_dataset_id="mlfoundations/dclm-baseline-1.0", - revision="a3b142c", - override_output_path="raw/dclm", - ) - - -def the_stack_dedup_download() -> StepSpec: - return download_hf_step( - "raw/the-stack-dedup", - hf_dataset_id="bigcode/the-stack-dedup", - revision="17cad72", - override_output_path="raw/the-stack-dedup-4ba450", - ) - - -def proofpile_2_download() -> StepSpec: - return download_hf_step( - "raw/proof-pile-2", - hf_dataset_id="EleutherAI/proof-pile-2", - revision="901a927", - override_output_path="raw/proof-pile-2-f1b1d8", - ) - - -def the_pile_openwebtext2_download() -> StepSpec: - return download_hf_step( - "raw/the_pile_openwebtext2", - hf_dataset_id="vietgpt/the_pile_openwebtext2", - revision="1de27c6", - override_output_path="raw/the_pile_openwebtext2", - ) - - -def starcoderdata_download() -> StepSpec: - return download_hf_step( - "raw/starcoderdata", - hf_dataset_id="bigcode/starcoderdata", - revision="9fc30b5", - override_output_path="raw/starcoderdata-720c8c", - ) - - -def dclm_baseline_wrong_download() -> StepSpec: - """Legacy download with incorrect path. Kept for backward compat.""" - return download_hf_step( - "raw/dclm-baseline-1.0", - hf_dataset_id="mlfoundations/dclm-baseline-1.0", - revision="a3b142c", - override_output_path="raw/dclm_WRONG_20250211/", - ) From ddb203797004cae44c687baa331f8f07e74673ba Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:33:25 -0700 Subject: [PATCH 15/56] Delete old marin.download/ shim directory All imports have been migrated to marin.datakit.download.*. The shim re-export layer has zero consumers and is now removed. Data files (ar5iv JSON, stackexchange TSV) moved to datakit/download/data/. 
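Callers now import from the canonical package directly; one representative example (other modules follow the same pattern):

```python
# Old shim path (removed):
#   from marin.download.huggingface.download_hf import download_hf
# Canonical location:
from marin.datakit.download.huggingface import download_hf
```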
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../download/data}/ar5iv-v04-2024.json | 0 .../download/data}/stackexchange/README.md | 0 .../data}/stackexchange/stackexchange-urls.tsv | 0 lib/marin/src/marin/download/__init__.py | 7 ------- lib/marin/src/marin/download/ar5iv/__init__.py | 2 -- lib/marin/src/marin/download/ar5iv/download.py | 7 ------- lib/marin/src/marin/download/dclm_hq/__init__.py | 2 -- .../marin/download/dclm_hq/download_dclm_hq_html.py | 9 --------- lib/marin/src/marin/download/filesystem/__init__.py | 2 -- lib/marin/src/marin/download/filesystem/transfer.py | 5 ----- .../src/marin/download/huggingface/__init__.py | 2 -- .../src/marin/download/huggingface/download_hf.py | 13 ------------- .../download/huggingface/stream_remove_columns.py | 8 -------- .../marin/download/huggingface/upload_gcs_to_hf.py | 10 ---------- .../src/marin/download/nemotron_cc/__init__.py | 2 -- .../download/nemotron_cc/download_nemotron_cc.py | 6 ------ .../src/marin/download/uncheatable_eval/__init__.py | 2 -- .../src/marin/download/uncheatable_eval/download.py | 12 ------------ lib/marin/src/marin/download/wikipedia/__init__.py | 2 -- lib/marin/src/marin/download/wikipedia/download.py | 7 ------- 20 files changed, 98 deletions(-) rename lib/marin/src/marin/{download/ar5iv => datakit/download/data}/ar5iv-v04-2024.json (100%) rename lib/marin/src/marin/{download => datakit/download/data}/stackexchange/README.md (100%) rename lib/marin/src/marin/{download => datakit/download/data}/stackexchange/stackexchange-urls.tsv (100%) delete mode 100644 lib/marin/src/marin/download/__init__.py delete mode 100644 lib/marin/src/marin/download/ar5iv/__init__.py delete mode 100644 lib/marin/src/marin/download/ar5iv/download.py delete mode 100644 lib/marin/src/marin/download/dclm_hq/__init__.py delete mode 100644 lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py delete mode 100644 lib/marin/src/marin/download/filesystem/__init__.py delete mode 100644 lib/marin/src/marin/download/filesystem/transfer.py delete mode 100644 lib/marin/src/marin/download/huggingface/__init__.py delete mode 100644 lib/marin/src/marin/download/huggingface/download_hf.py delete mode 100644 lib/marin/src/marin/download/huggingface/stream_remove_columns.py delete mode 100644 lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py delete mode 100644 lib/marin/src/marin/download/nemotron_cc/__init__.py delete mode 100644 lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py delete mode 100644 lib/marin/src/marin/download/uncheatable_eval/__init__.py delete mode 100644 lib/marin/src/marin/download/uncheatable_eval/download.py delete mode 100644 lib/marin/src/marin/download/wikipedia/__init__.py delete mode 100644 lib/marin/src/marin/download/wikipedia/download.py diff --git a/lib/marin/src/marin/download/ar5iv/ar5iv-v04-2024.json b/lib/marin/src/marin/datakit/download/data/ar5iv-v04-2024.json similarity index 100% rename from lib/marin/src/marin/download/ar5iv/ar5iv-v04-2024.json rename to lib/marin/src/marin/datakit/download/data/ar5iv-v04-2024.json diff --git a/lib/marin/src/marin/download/stackexchange/README.md b/lib/marin/src/marin/datakit/download/data/stackexchange/README.md similarity index 100% rename from lib/marin/src/marin/download/stackexchange/README.md rename to lib/marin/src/marin/datakit/download/data/stackexchange/README.md diff --git a/lib/marin/src/marin/download/stackexchange/stackexchange-urls.tsv b/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv similarity 
index 100% rename from lib/marin/src/marin/download/stackexchange/stackexchange-urls.tsv rename to lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv diff --git a/lib/marin/src/marin/download/__init__.py b/lib/marin/src/marin/download/__init__.py deleted file mode 100644 index 26067cbf97..0000000000 --- a/lib/marin/src/marin/download/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download - -from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig -from marin.datakit.download.huggingface import download_hf -from marin.datakit.download.huggingface import download_hf as download_hf_ungated diff --git a/lib/marin/src/marin/download/ar5iv/__init__.py b/lib/marin/src/marin/download/ar5iv/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/ar5iv/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/ar5iv/download.py b/lib/marin/src/marin/download/ar5iv/download.py deleted file mode 100644 index 1a64dbf93e..0000000000 --- a/lib/marin/src/marin/download/ar5iv/download.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.ar5iv - -from marin.datakit.download.ar5iv import Ar5ivDownloadConfig as DownloadConfig # noqa: F401 - used by tests -from marin.datakit.download.ar5iv import download as download -from marin.datakit.download.ar5iv import process_shard as process_shard diff --git a/lib/marin/src/marin/download/dclm_hq/__init__.py b/lib/marin/src/marin/download/dclm_hq/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/dclm_hq/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py b/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py deleted file mode 100644 index a98513e7df..0000000000 --- a/lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. 
Canonical location: marin.datakit.download.dclm_hq - -from marin.datakit.download.dclm_hq import FileTask as FileTask -from marin.datakit.download.dclm_hq import extract_dclm_hq_dump as extract_dclm_hq_dump -from marin.datakit.download.dclm_hq import fetch_warc_from_cc as fetch_warc_from_cc -from marin.datakit.download.dclm_hq import find_html_in_cc as find_html_in_cc -from marin.datakit.download.dclm_hq import process_file as process_file diff --git a/lib/marin/src/marin/download/filesystem/__init__.py b/lib/marin/src/marin/download/filesystem/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/filesystem/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/filesystem/transfer.py b/lib/marin/src/marin/download/filesystem/transfer.py deleted file mode 100644 index 045a360623..0000000000 --- a/lib/marin/src/marin/download/filesystem/transfer.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.filesystem - -from marin.datakit.download.filesystem import transfer_files as transfer_files diff --git a/lib/marin/src/marin/download/huggingface/__init__.py b/lib/marin/src/marin/download/huggingface/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/huggingface/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/huggingface/download_hf.py b/lib/marin/src/marin/download/huggingface/download_hf.py deleted file mode 100644 index 2dd0177806..0000000000 --- a/lib/marin/src/marin/download/huggingface/download_hf.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.huggingface - -from marin.datakit.download.huggingface import DownloadConfig as DownloadConfig -from marin.datakit.download.huggingface import _relative_path_in_source as _relative_path_in_source -from marin.datakit.download.huggingface import download_hf as download_hf -from marin.datakit.download.huggingface import ensure_fsspec_path_writable as ensure_fsspec_path_writable -from marin.datakit.download.huggingface import main as main -from marin.datakit.download.huggingface import stream_file_to_fsspec as stream_file_to_fsspec - -if __name__ == "__main__": - main() diff --git a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py b/lib/marin/src/marin/download/huggingface/stream_remove_columns.py deleted file mode 100644 index 68a44db40c..0000000000 --- a/lib/marin/src/marin/download/huggingface/stream_remove_columns.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. 
Canonical location: marin.datakit.download.stream_remove_columns - -from marin.datakit.download.stream_remove_columns import DatasetConfig as DatasetConfig -from marin.datakit.download.stream_remove_columns import get_file_tasks as get_file_tasks -from marin.datakit.download.stream_remove_columns import prune_hf_dataset as prune_hf_dataset -from marin.datakit.download.stream_remove_columns import prune_stream_and_save as prune_stream_and_save diff --git a/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py b/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py deleted file mode 100644 index 43c368f5b9..0000000000 --- a/lib/marin/src/marin/download/huggingface/upload_gcs_to_hf.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.upload_gcs_to_hf - -from marin.datakit.download.upload_gcs_to_hf import UploadConfig as UploadConfig -from marin.datakit.download.upload_gcs_to_hf import main as main -from marin.datakit.download.upload_gcs_to_hf import upload_gcs_to_hf as upload_gcs_to_hf - -if __name__ == "__main__": - main() diff --git a/lib/marin/src/marin/download/nemotron_cc/__init__.py b/lib/marin/src/marin/download/nemotron_cc/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/nemotron_cc/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py b/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py deleted file mode 100644 index c7e8e16e54..0000000000 --- a/lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.nemotron_cc - -from marin.datakit.download.nemotron_cc import download_nemotron_cc as download_nemotron_cc -from marin.datakit.download.nemotron_cc import download_single_nemotron_path as download_single_nemotron_path diff --git a/lib/marin/src/marin/download/uncheatable_eval/__init__.py b/lib/marin/src/marin/download/uncheatable_eval/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/uncheatable_eval/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/uncheatable_eval/download.py b/lib/marin/src/marin/download/uncheatable_eval/download.py deleted file mode 100644 index 9baf9db8ad..0000000000 --- a/lib/marin/src/marin/download/uncheatable_eval/download.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. 
Canonical location: marin.datakit.download.uncheatable_eval - -from marin.datakit.download.uncheatable_eval import UncheatableEvalDataset as UncheatableEvalDataset -from marin.datakit.download.uncheatable_eval import ( - UncheatableEvalDownloadConfig as UncheatableEvalDownloadConfig, -) -from marin.datakit.download.uncheatable_eval import ( - download_latest_uncheatable_eval as download_latest_uncheatable_eval, -) -from marin.datakit.download.uncheatable_eval import make_uncheatable_eval_step as make_uncheatable_eval_step diff --git a/lib/marin/src/marin/download/wikipedia/__init__.py b/lib/marin/src/marin/download/wikipedia/__init__.py deleted file mode 100644 index ec8bc038b7..0000000000 --- a/lib/marin/src/marin/download/wikipedia/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 diff --git a/lib/marin/src/marin/download/wikipedia/download.py b/lib/marin/src/marin/download/wikipedia/download.py deleted file mode 100644 index 9b50143040..0000000000 --- a/lib/marin/src/marin/download/wikipedia/download.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 -# Backward-compat shim. Canonical location: marin.datakit.download.wikipedia - -from marin.datakit.download.wikipedia import download as download -from marin.datakit.download.wikipedia import download_tar as download_tar -from marin.datakit.download.wikipedia import process_file as process_file From f4f7cabf9a34880a519641fa1a20fdd506dc1581 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:36:43 -0700 Subject: [PATCH 16/56] Remove unused stackexchange data files The TSV and README were not referenced by any code. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../download/data/stackexchange/README.md | 20 -- .../data/stackexchange/stackexchange-urls.tsv | 183 ------------------ 2 files changed, 203 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download/data/stackexchange/README.md delete mode 100644 lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/README.md b/lib/marin/src/marin/datakit/download/data/stackexchange/README.md deleted file mode 100644 index 295232a502..0000000000 --- a/lib/marin/src/marin/datakit/download/data/stackexchange/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Downloading Stackexchange Data - -Raw StackExchange dumps are available at https://archive.org/download/stackexchange. We use the dump from 2024-04-02. -We exclude "meta" sites and only use the main sites (i.e., we use "3dprinting.stackexchange.com.7z" but don't use -"3dprinting.meta.stackexchange.com.7z"). The full dump is approximately 100 GB. - -**Downloading Data to GCS**: To get the raw data, we use the GCS Storage Transfer Service to perform the data transfer. 
-To kick off the job, create `stackexchange-urls.tsv` using the following instructions (per @dlwh): - -- Go to `[https://archive.org/details/stackexchange](https://archive.org/details/stackexchange)` -- Expand the `7z` sidebar, copy all the names (w/ mouse) -- Paste into a text editor (i.e., VSCode) -- Run (sequence of find/replace commands - regex mode) - + Remove all " download" strings -- match on `download ` - + Remove all file sizes (e.g., 188M) -- match on `^\d.*?\d[KMG]` - + Remove all `meta` sites -- match on `.*\.meta\..*\n` - + Prepend URL Prefix `https://archive.org/download/stackexchange/` to each line - + Insert `TsvHttpData-1.0` on the first line - -Pass this file to the Storage Transfer Job CLI to kick off the transfer. diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv b/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv deleted file mode 100644 index 763e0341da..0000000000 --- a/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv +++ /dev/null @@ -1,183 +0,0 @@ -TsvHttpData-1.0 -https://archive.org/download/stackexchange/3dprinting.stackexchange.com.7z -https://archive.org/download/stackexchange/academia.stackexchange.com.7z -https://archive.org/download/stackexchange/ai.stackexchange.com.7z -https://archive.org/download/stackexchange/android.stackexchange.com.7z -https://archive.org/download/stackexchange/anime.stackexchange.com.7z -https://archive.org/download/stackexchange/apple.stackexchange.com.7z -https://archive.org/download/stackexchange/arduino.stackexchange.com.7z -https://archive.org/download/stackexchange/askubuntu.com.7z -https://archive.org/download/stackexchange/astronomy.stackexchange.com.7z -https://archive.org/download/stackexchange/aviation.stackexchange.com.7z -https://archive.org/download/stackexchange/avp.stackexchange.com.7z -https://archive.org/download/stackexchange/beer.stackexchange.com.7z -https://archive.org/download/stackexchange/bicycles.stackexchange.com.7z -https://archive.org/download/stackexchange/bioacoustics.stackexchange.com.7z -https://archive.org/download/stackexchange/bioinformatics.stackexchange.com.7z -https://archive.org/download/stackexchange/biology.stackexchange.com.7z -https://archive.org/download/stackexchange/bitcoin.stackexchange.com.7z -https://archive.org/download/stackexchange/blender.stackexchange.com.7z -https://archive.org/download/stackexchange/boardgames.stackexchange.com.7z -https://archive.org/download/stackexchange/bricks.stackexchange.com.7z -https://archive.org/download/stackexchange/buddhism.stackexchange.com.7z -https://archive.org/download/stackexchange/cardano.stackexchange.com.7z -https://archive.org/download/stackexchange/chemistry.stackexchange.com.7z -https://archive.org/download/stackexchange/chess.stackexchange.com.7z -https://archive.org/download/stackexchange/chinese.stackexchange.com.7z -https://archive.org/download/stackexchange/christianity.stackexchange.com.7z -https://archive.org/download/stackexchange/civicrm.stackexchange.com.7z -https://archive.org/download/stackexchange/codegolf.stackexchange.com.7z -https://archive.org/download/stackexchange/codereview.stackexchange.com.7z -https://archive.org/download/stackexchange/coffee.stackexchange.com.7z -https://archive.org/download/stackexchange/cogsci.stackexchange.com.7z -https://archive.org/download/stackexchange/computergraphics.stackexchange.com.7z -https://archive.org/download/stackexchange/conlang.stackexchange.com.7z 
-https://archive.org/download/stackexchange/cooking.stackexchange.com.7z -https://archive.org/download/stackexchange/craftcms.stackexchange.com.7z -https://archive.org/download/stackexchange/crafts.stackexchange.com.7z -https://archive.org/download/stackexchange/crypto.stackexchange.com.7z -https://archive.org/download/stackexchange/cs.stackexchange.com.7z -https://archive.org/download/stackexchange/cseducators.stackexchange.com.7z -https://archive.org/download/stackexchange/cstheory.stackexchange.com.7z -https://archive.org/download/stackexchange/datascience.stackexchange.com.7z -https://archive.org/download/stackexchange/dba.stackexchange.com.7z -https://archive.org/download/stackexchange/devops.stackexchange.com.7z -https://archive.org/download/stackexchange/diy.stackexchange.com.7z -https://archive.org/download/stackexchange/drones.stackexchange.com.7z -https://archive.org/download/stackexchange/drupal.stackexchange.com.7z -https://archive.org/download/stackexchange/dsp.stackexchange.com.7z -https://archive.org/download/stackexchange/earthscience.stackexchange.com.7z -https://archive.org/download/stackexchange/ebooks.stackexchange.com.7z -https://archive.org/download/stackexchange/economics.stackexchange.com.7z -https://archive.org/download/stackexchange/electronics.stackexchange.com.7z -https://archive.org/download/stackexchange/elementaryos.stackexchange.com.7z -https://archive.org/download/stackexchange/ell.stackexchange.com.7z -https://archive.org/download/stackexchange/emacs.stackexchange.com.7z -https://archive.org/download/stackexchange/engineering.stackexchange.com.7z -https://archive.org/download/stackexchange/english.stackexchange.com.7z -https://archive.org/download/stackexchange/eosio.stackexchange.com.7z -https://archive.org/download/stackexchange/es.stackoverflow.com.7z -https://archive.org/download/stackexchange/esperanto.stackexchange.com.7z -https://archive.org/download/stackexchange/ethereum.stackexchange.com.7z -https://archive.org/download/stackexchange/expatriates.stackexchange.com.7z -https://archive.org/download/stackexchange/expressionengine.stackexchange.com.7z -https://archive.org/download/stackexchange/fitness.stackexchange.com.7z -https://archive.org/download/stackexchange/freelancing.stackexchange.com.7z -https://archive.org/download/stackexchange/french.stackexchange.com.7z -https://archive.org/download/stackexchange/gamedev.stackexchange.com.7z -https://archive.org/download/stackexchange/gaming.stackexchange.com.7z -https://archive.org/download/stackexchange/gardening.stackexchange.com.7z -https://archive.org/download/stackexchange/genai.stackexchange.com.7z -https://archive.org/download/stackexchange/genealogy.stackexchange.com.7z -https://archive.org/download/stackexchange/german.stackexchange.com.7z -https://archive.org/download/stackexchange/gis.stackexchange.com.7z -https://archive.org/download/stackexchange/graphicdesign.stackexchange.com.7z -https://archive.org/download/stackexchange/ham.stackexchange.com.7z -https://archive.org/download/stackexchange/hardwarerecs.stackexchange.com.7z -https://archive.org/download/stackexchange/health.stackexchange.com.7z -https://archive.org/download/stackexchange/hermeneutics.stackexchange.com.7z -https://archive.org/download/stackexchange/hinduism.stackexchange.com.7z -https://archive.org/download/stackexchange/history.stackexchange.com.7z -https://archive.org/download/stackexchange/homebrew.stackexchange.com.7z -https://archive.org/download/stackexchange/hsm.stackexchange.com.7z 
-https://archive.org/download/stackexchange/interpersonal.stackexchange.com.7z -https://archive.org/download/stackexchange/iot.stackexchange.com.7z -https://archive.org/download/stackexchange/iota.stackexchange.com.7z -https://archive.org/download/stackexchange/islam.stackexchange.com.7z -https://archive.org/download/stackexchange/italian.stackexchange.com.7z -https://archive.org/download/stackexchange/ja.stackoverflow.com.7z -https://archive.org/download/stackexchange/japanese.stackexchange.com.7z -https://archive.org/download/stackexchange/joomla.stackexchange.com.7z -https://archive.org/download/stackexchange/judaism.stackexchange.com.7z -https://archive.org/download/stackexchange/korean.stackexchange.com.7z -https://archive.org/download/stackexchange/langdev.stackexchange.com.7z -https://archive.org/download/stackexchange/languagelearning.stackexchange.com.7z -https://archive.org/download/stackexchange/latin.stackexchange.com.7z -https://archive.org/download/stackexchange/law.stackexchange.com.7z -https://archive.org/download/stackexchange/lifehacks.stackexchange.com.7z -https://archive.org/download/stackexchange/linguistics.stackexchange.com.7z -https://archive.org/download/stackexchange/literature.stackexchange.com.7z -https://archive.org/download/stackexchange/magento.stackexchange.com.7z -https://archive.org/download/stackexchange/martialarts.stackexchange.com.7z -https://archive.org/download/stackexchange/materials.stackexchange.com.7z -https://archive.org/download/stackexchange/math.stackexchange.com.7z -https://archive.org/download/stackexchange/matheducators.stackexchange.com.7z -https://archive.org/download/stackexchange/mathematica.stackexchange.com.7z -https://archive.org/download/stackexchange/mathoverflow.net.7z -https://archive.org/download/stackexchange/mechanics.stackexchange.com.7z -https://archive.org/download/stackexchange/moderators.stackexchange.com.7z -https://archive.org/download/stackexchange/monero.stackexchange.com.7z -https://archive.org/download/stackexchange/money.stackexchange.com.7z -https://archive.org/download/stackexchange/movies.stackexchange.com.7z -https://archive.org/download/stackexchange/music.stackexchange.com.7z -https://archive.org/download/stackexchange/musicfans.stackexchange.com.7z -https://archive.org/download/stackexchange/mythology.stackexchange.com.7z -https://archive.org/download/stackexchange/networkengineering.stackexchange.com.7z -https://archive.org/download/stackexchange/opendata.stackexchange.com.7z -https://archive.org/download/stackexchange/opensource.stackexchange.com.7z -https://archive.org/download/stackexchange/or.stackexchange.com.7z -https://archive.org/download/stackexchange/outdoors.stackexchange.com.7z -https://archive.org/download/stackexchange/parenting.stackexchange.com.7z -https://archive.org/download/stackexchange/patents.stackexchange.com.7z -https://archive.org/download/stackexchange/pets.stackexchange.com.7z -https://archive.org/download/stackexchange/philosophy.stackexchange.com.7z -https://archive.org/download/stackexchange/photo.stackexchange.com.7z -https://archive.org/download/stackexchange/physics.stackexchange.com.7z -https://archive.org/download/stackexchange/pm.stackexchange.com.7z -https://archive.org/download/stackexchange/poker.stackexchange.com.7z -https://archive.org/download/stackexchange/politics.stackexchange.com.7z -https://archive.org/download/stackexchange/portuguese.stackexchange.com.7z -https://archive.org/download/stackexchange/proofassistants.stackexchange.com.7z 
-https://archive.org/download/stackexchange/pt.stackoverflow.com.7z -https://archive.org/download/stackexchange/puzzling.stackexchange.com.7z -https://archive.org/download/stackexchange/quant.stackexchange.com.7z -https://archive.org/download/stackexchange/quantumcomputing.stackexchange.com.7z -https://archive.org/download/stackexchange/raspberrypi.stackexchange.com.7z -https://archive.org/download/stackexchange/retrocomputing.stackexchange.com.7z -https://archive.org/download/stackexchange/reverseengineering.stackexchange.com.7z -https://archive.org/download/stackexchange/robotics.stackexchange.com.7z -https://archive.org/download/stackexchange/rpg.stackexchange.com.7z -https://archive.org/download/stackexchange/ru.stackoverflow.com.7z -https://archive.org/download/stackexchange/rus.stackexchange.com.7z -https://archive.org/download/stackexchange/russian.stackexchange.com.7z -https://archive.org/download/stackexchange/salesforce.stackexchange.com.7z -https://archive.org/download/stackexchange/scicomp.stackexchange.com.7z -https://archive.org/download/stackexchange/scifi.stackexchange.com.7z -https://archive.org/download/stackexchange/security.stackexchange.com.7z -https://archive.org/download/stackexchange/serverfault.com.7z -https://archive.org/download/stackexchange/sharepoint.stackexchange.com.7z -https://archive.org/download/stackexchange/sitecore.stackexchange.com.7z -https://archive.org/download/stackexchange/skeptics.stackexchange.com.7z -https://archive.org/download/stackexchange/softwareengineering.stackexchange.com.7z -https://archive.org/download/stackexchange/softwarerecs.stackexchange.com.7z -https://archive.org/download/stackexchange/solana.stackexchange.com.7z -https://archive.org/download/stackexchange/sound.stackexchange.com.7z -https://archive.org/download/stackexchange/space.stackexchange.com.7z -https://archive.org/download/stackexchange/spanish.stackexchange.com.7z -https://archive.org/download/stackexchange/sports.stackexchange.com.7z -https://archive.org/download/stackexchange/sqa.stackexchange.com.7z -https://archive.org/download/stackexchange/stackapps.com.7z -https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z -https://archive.org/download/stackexchange/stats.stackexchange.com.7z -https://archive.org/download/stackexchange/stellar.stackexchange.com.7z -https://archive.org/download/stackexchange/substrate.stackexchange.com.7z -https://archive.org/download/stackexchange/superuser.com.7z -https://archive.org/download/stackexchange/sustainability.stackexchange.com.7z -https://archive.org/download/stackexchange/tex.stackexchange.com.7z -https://archive.org/download/stackexchange/tezos.stackexchange.com.7z -https://archive.org/download/stackexchange/tor.stackexchange.com.7z -https://archive.org/download/stackexchange/travel.stackexchange.com.7z -https://archive.org/download/stackexchange/tridion.stackexchange.com.7z -https://archive.org/download/stackexchange/ukrainian.stackexchange.com.7z -https://archive.org/download/stackexchange/unix.stackexchange.com.7z -https://archive.org/download/stackexchange/ux.stackexchange.com.7z -https://archive.org/download/stackexchange/vegetarianism.stackexchange.com.7z -https://archive.org/download/stackexchange/vi.stackexchange.com.7z -https://archive.org/download/stackexchange/webapps.stackexchange.com.7z -https://archive.org/download/stackexchange/webmasters.stackexchange.com.7z -https://archive.org/download/stackexchange/windowsphone.stackexchange.com.7z 
-https://archive.org/download/stackexchange/woodworking.stackexchange.com.7z -https://archive.org/download/stackexchange/wordpress.stackexchange.com.7z -https://archive.org/download/stackexchange/workplace.stackexchange.com.7z -https://archive.org/download/stackexchange/worldbuilding.stackexchange.com.7z -https://archive.org/download/stackexchange/writers.stackexchange.com.7z From 64fd456507e6f29c015323247a71007b5f41aad9 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:37:54 -0700 Subject: [PATCH 17/56] Revert "Remove unused stackexchange data files" This reverts commit ced1f4f307809d14f4e50444c943cf865d3e39fa. --- .../download/data/stackexchange/README.md | 20 ++ .../data/stackexchange/stackexchange-urls.tsv | 183 ++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 lib/marin/src/marin/datakit/download/data/stackexchange/README.md create mode 100644 lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/README.md b/lib/marin/src/marin/datakit/download/data/stackexchange/README.md new file mode 100644 index 0000000000..295232a502 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/data/stackexchange/README.md @@ -0,0 +1,20 @@ +# Downloading Stackexchange Data + +Raw StackExchange dumps are available at https://archive.org/download/stackexchange. We use the dump from 2024-04-02. +We exclude "meta" sites and only use the main sites (i.e., we use "3dprinting.stackexchange.com.7z" but don't use +"3dprinting.meta.stackexchange.com.7z"). The full dump is approximately 100 GB. + +**Downloading Data to GCS**: To get the raw data, we use the GCS Storage Transfer Service to perform the data transfer. +To kick off the job, create `stackexchange-urls.tsv` using the following instructions (per @dlwh): + +- Go to `[https://archive.org/details/stackexchange](https://archive.org/details/stackexchange)` +- Expand the `7z` sidebar, copy all the names (w/ mouse) +- Paste into a text editor (i.e., VSCode) +- Run (sequence of find/replace commands - regex mode) + + Remove all " download" strings -- match on `download ` + + Remove all file sizes (e.g., 188M) -- match on `^\d.*?\d[KMG]` + + Remove all `meta` sites -- match on `.*\.meta\..*\n` + + Prepend URL Prefix `https://archive.org/download/stackexchange/` to each line + + Insert `TsvHttpData-1.0` on the first line + +Pass this file to the Storage Transfer Job CLI to kick off the transfer. 
diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv b/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv new file mode 100644 index 0000000000..763e0341da --- /dev/null +++ b/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv @@ -0,0 +1,183 @@ +TsvHttpData-1.0 +https://archive.org/download/stackexchange/3dprinting.stackexchange.com.7z +https://archive.org/download/stackexchange/academia.stackexchange.com.7z +https://archive.org/download/stackexchange/ai.stackexchange.com.7z +https://archive.org/download/stackexchange/android.stackexchange.com.7z +https://archive.org/download/stackexchange/anime.stackexchange.com.7z +https://archive.org/download/stackexchange/apple.stackexchange.com.7z +https://archive.org/download/stackexchange/arduino.stackexchange.com.7z +https://archive.org/download/stackexchange/askubuntu.com.7z +https://archive.org/download/stackexchange/astronomy.stackexchange.com.7z +https://archive.org/download/stackexchange/aviation.stackexchange.com.7z +https://archive.org/download/stackexchange/avp.stackexchange.com.7z +https://archive.org/download/stackexchange/beer.stackexchange.com.7z +https://archive.org/download/stackexchange/bicycles.stackexchange.com.7z +https://archive.org/download/stackexchange/bioacoustics.stackexchange.com.7z +https://archive.org/download/stackexchange/bioinformatics.stackexchange.com.7z +https://archive.org/download/stackexchange/biology.stackexchange.com.7z +https://archive.org/download/stackexchange/bitcoin.stackexchange.com.7z +https://archive.org/download/stackexchange/blender.stackexchange.com.7z +https://archive.org/download/stackexchange/boardgames.stackexchange.com.7z +https://archive.org/download/stackexchange/bricks.stackexchange.com.7z +https://archive.org/download/stackexchange/buddhism.stackexchange.com.7z +https://archive.org/download/stackexchange/cardano.stackexchange.com.7z +https://archive.org/download/stackexchange/chemistry.stackexchange.com.7z +https://archive.org/download/stackexchange/chess.stackexchange.com.7z +https://archive.org/download/stackexchange/chinese.stackexchange.com.7z +https://archive.org/download/stackexchange/christianity.stackexchange.com.7z +https://archive.org/download/stackexchange/civicrm.stackexchange.com.7z +https://archive.org/download/stackexchange/codegolf.stackexchange.com.7z +https://archive.org/download/stackexchange/codereview.stackexchange.com.7z +https://archive.org/download/stackexchange/coffee.stackexchange.com.7z +https://archive.org/download/stackexchange/cogsci.stackexchange.com.7z +https://archive.org/download/stackexchange/computergraphics.stackexchange.com.7z +https://archive.org/download/stackexchange/conlang.stackexchange.com.7z +https://archive.org/download/stackexchange/cooking.stackexchange.com.7z +https://archive.org/download/stackexchange/craftcms.stackexchange.com.7z +https://archive.org/download/stackexchange/crafts.stackexchange.com.7z +https://archive.org/download/stackexchange/crypto.stackexchange.com.7z +https://archive.org/download/stackexchange/cs.stackexchange.com.7z +https://archive.org/download/stackexchange/cseducators.stackexchange.com.7z +https://archive.org/download/stackexchange/cstheory.stackexchange.com.7z +https://archive.org/download/stackexchange/datascience.stackexchange.com.7z +https://archive.org/download/stackexchange/dba.stackexchange.com.7z +https://archive.org/download/stackexchange/devops.stackexchange.com.7z 
+https://archive.org/download/stackexchange/diy.stackexchange.com.7z +https://archive.org/download/stackexchange/drones.stackexchange.com.7z +https://archive.org/download/stackexchange/drupal.stackexchange.com.7z +https://archive.org/download/stackexchange/dsp.stackexchange.com.7z +https://archive.org/download/stackexchange/earthscience.stackexchange.com.7z +https://archive.org/download/stackexchange/ebooks.stackexchange.com.7z +https://archive.org/download/stackexchange/economics.stackexchange.com.7z +https://archive.org/download/stackexchange/electronics.stackexchange.com.7z +https://archive.org/download/stackexchange/elementaryos.stackexchange.com.7z +https://archive.org/download/stackexchange/ell.stackexchange.com.7z +https://archive.org/download/stackexchange/emacs.stackexchange.com.7z +https://archive.org/download/stackexchange/engineering.stackexchange.com.7z +https://archive.org/download/stackexchange/english.stackexchange.com.7z +https://archive.org/download/stackexchange/eosio.stackexchange.com.7z +https://archive.org/download/stackexchange/es.stackoverflow.com.7z +https://archive.org/download/stackexchange/esperanto.stackexchange.com.7z +https://archive.org/download/stackexchange/ethereum.stackexchange.com.7z +https://archive.org/download/stackexchange/expatriates.stackexchange.com.7z +https://archive.org/download/stackexchange/expressionengine.stackexchange.com.7z +https://archive.org/download/stackexchange/fitness.stackexchange.com.7z +https://archive.org/download/stackexchange/freelancing.stackexchange.com.7z +https://archive.org/download/stackexchange/french.stackexchange.com.7z +https://archive.org/download/stackexchange/gamedev.stackexchange.com.7z +https://archive.org/download/stackexchange/gaming.stackexchange.com.7z +https://archive.org/download/stackexchange/gardening.stackexchange.com.7z +https://archive.org/download/stackexchange/genai.stackexchange.com.7z +https://archive.org/download/stackexchange/genealogy.stackexchange.com.7z +https://archive.org/download/stackexchange/german.stackexchange.com.7z +https://archive.org/download/stackexchange/gis.stackexchange.com.7z +https://archive.org/download/stackexchange/graphicdesign.stackexchange.com.7z +https://archive.org/download/stackexchange/ham.stackexchange.com.7z +https://archive.org/download/stackexchange/hardwarerecs.stackexchange.com.7z +https://archive.org/download/stackexchange/health.stackexchange.com.7z +https://archive.org/download/stackexchange/hermeneutics.stackexchange.com.7z +https://archive.org/download/stackexchange/hinduism.stackexchange.com.7z +https://archive.org/download/stackexchange/history.stackexchange.com.7z +https://archive.org/download/stackexchange/homebrew.stackexchange.com.7z +https://archive.org/download/stackexchange/hsm.stackexchange.com.7z +https://archive.org/download/stackexchange/interpersonal.stackexchange.com.7z +https://archive.org/download/stackexchange/iot.stackexchange.com.7z +https://archive.org/download/stackexchange/iota.stackexchange.com.7z +https://archive.org/download/stackexchange/islam.stackexchange.com.7z +https://archive.org/download/stackexchange/italian.stackexchange.com.7z +https://archive.org/download/stackexchange/ja.stackoverflow.com.7z +https://archive.org/download/stackexchange/japanese.stackexchange.com.7z +https://archive.org/download/stackexchange/joomla.stackexchange.com.7z +https://archive.org/download/stackexchange/judaism.stackexchange.com.7z +https://archive.org/download/stackexchange/korean.stackexchange.com.7z 
+https://archive.org/download/stackexchange/langdev.stackexchange.com.7z +https://archive.org/download/stackexchange/languagelearning.stackexchange.com.7z +https://archive.org/download/stackexchange/latin.stackexchange.com.7z +https://archive.org/download/stackexchange/law.stackexchange.com.7z +https://archive.org/download/stackexchange/lifehacks.stackexchange.com.7z +https://archive.org/download/stackexchange/linguistics.stackexchange.com.7z +https://archive.org/download/stackexchange/literature.stackexchange.com.7z +https://archive.org/download/stackexchange/magento.stackexchange.com.7z +https://archive.org/download/stackexchange/martialarts.stackexchange.com.7z +https://archive.org/download/stackexchange/materials.stackexchange.com.7z +https://archive.org/download/stackexchange/math.stackexchange.com.7z +https://archive.org/download/stackexchange/matheducators.stackexchange.com.7z +https://archive.org/download/stackexchange/mathematica.stackexchange.com.7z +https://archive.org/download/stackexchange/mathoverflow.net.7z +https://archive.org/download/stackexchange/mechanics.stackexchange.com.7z +https://archive.org/download/stackexchange/moderators.stackexchange.com.7z +https://archive.org/download/stackexchange/monero.stackexchange.com.7z +https://archive.org/download/stackexchange/money.stackexchange.com.7z +https://archive.org/download/stackexchange/movies.stackexchange.com.7z +https://archive.org/download/stackexchange/music.stackexchange.com.7z +https://archive.org/download/stackexchange/musicfans.stackexchange.com.7z +https://archive.org/download/stackexchange/mythology.stackexchange.com.7z +https://archive.org/download/stackexchange/networkengineering.stackexchange.com.7z +https://archive.org/download/stackexchange/opendata.stackexchange.com.7z +https://archive.org/download/stackexchange/opensource.stackexchange.com.7z +https://archive.org/download/stackexchange/or.stackexchange.com.7z +https://archive.org/download/stackexchange/outdoors.stackexchange.com.7z +https://archive.org/download/stackexchange/parenting.stackexchange.com.7z +https://archive.org/download/stackexchange/patents.stackexchange.com.7z +https://archive.org/download/stackexchange/pets.stackexchange.com.7z +https://archive.org/download/stackexchange/philosophy.stackexchange.com.7z +https://archive.org/download/stackexchange/photo.stackexchange.com.7z +https://archive.org/download/stackexchange/physics.stackexchange.com.7z +https://archive.org/download/stackexchange/pm.stackexchange.com.7z +https://archive.org/download/stackexchange/poker.stackexchange.com.7z +https://archive.org/download/stackexchange/politics.stackexchange.com.7z +https://archive.org/download/stackexchange/portuguese.stackexchange.com.7z +https://archive.org/download/stackexchange/proofassistants.stackexchange.com.7z +https://archive.org/download/stackexchange/pt.stackoverflow.com.7z +https://archive.org/download/stackexchange/puzzling.stackexchange.com.7z +https://archive.org/download/stackexchange/quant.stackexchange.com.7z +https://archive.org/download/stackexchange/quantumcomputing.stackexchange.com.7z +https://archive.org/download/stackexchange/raspberrypi.stackexchange.com.7z +https://archive.org/download/stackexchange/retrocomputing.stackexchange.com.7z +https://archive.org/download/stackexchange/reverseengineering.stackexchange.com.7z +https://archive.org/download/stackexchange/robotics.stackexchange.com.7z +https://archive.org/download/stackexchange/rpg.stackexchange.com.7z 
+https://archive.org/download/stackexchange/ru.stackoverflow.com.7z +https://archive.org/download/stackexchange/rus.stackexchange.com.7z +https://archive.org/download/stackexchange/russian.stackexchange.com.7z +https://archive.org/download/stackexchange/salesforce.stackexchange.com.7z +https://archive.org/download/stackexchange/scicomp.stackexchange.com.7z +https://archive.org/download/stackexchange/scifi.stackexchange.com.7z +https://archive.org/download/stackexchange/security.stackexchange.com.7z +https://archive.org/download/stackexchange/serverfault.com.7z +https://archive.org/download/stackexchange/sharepoint.stackexchange.com.7z +https://archive.org/download/stackexchange/sitecore.stackexchange.com.7z +https://archive.org/download/stackexchange/skeptics.stackexchange.com.7z +https://archive.org/download/stackexchange/softwareengineering.stackexchange.com.7z +https://archive.org/download/stackexchange/softwarerecs.stackexchange.com.7z +https://archive.org/download/stackexchange/solana.stackexchange.com.7z +https://archive.org/download/stackexchange/sound.stackexchange.com.7z +https://archive.org/download/stackexchange/space.stackexchange.com.7z +https://archive.org/download/stackexchange/spanish.stackexchange.com.7z +https://archive.org/download/stackexchange/sports.stackexchange.com.7z +https://archive.org/download/stackexchange/sqa.stackexchange.com.7z +https://archive.org/download/stackexchange/stackapps.com.7z +https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z +https://archive.org/download/stackexchange/stats.stackexchange.com.7z +https://archive.org/download/stackexchange/stellar.stackexchange.com.7z +https://archive.org/download/stackexchange/substrate.stackexchange.com.7z +https://archive.org/download/stackexchange/superuser.com.7z +https://archive.org/download/stackexchange/sustainability.stackexchange.com.7z +https://archive.org/download/stackexchange/tex.stackexchange.com.7z +https://archive.org/download/stackexchange/tezos.stackexchange.com.7z +https://archive.org/download/stackexchange/tor.stackexchange.com.7z +https://archive.org/download/stackexchange/travel.stackexchange.com.7z +https://archive.org/download/stackexchange/tridion.stackexchange.com.7z +https://archive.org/download/stackexchange/ukrainian.stackexchange.com.7z +https://archive.org/download/stackexchange/unix.stackexchange.com.7z +https://archive.org/download/stackexchange/ux.stackexchange.com.7z +https://archive.org/download/stackexchange/vegetarianism.stackexchange.com.7z +https://archive.org/download/stackexchange/vi.stackexchange.com.7z +https://archive.org/download/stackexchange/webapps.stackexchange.com.7z +https://archive.org/download/stackexchange/webmasters.stackexchange.com.7z +https://archive.org/download/stackexchange/windowsphone.stackexchange.com.7z +https://archive.org/download/stackexchange/woodworking.stackexchange.com.7z +https://archive.org/download/stackexchange/wordpress.stackexchange.com.7z +https://archive.org/download/stackexchange/workplace.stackexchange.com.7z +https://archive.org/download/stackexchange/worldbuilding.stackexchange.com.7z +https://archive.org/download/stackexchange/writers.stackexchange.com.7z From d603b252581ffe1a12ecc8b25d9b542231908aab Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:41:35 -0700 Subject: [PATCH 18/56] Move upload_gcs_to_hf from datakit/download/ to utilities/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not a download step — it uploads checkpoints from GCS to 
HuggingFace. Belongs in utilities. Updates the one consumer (exp1063_upload_tootsie). Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/tootsie/exp1063_upload_tootsie.py | 2 +- .../marin/{datakit/download => utilities}/upload_gcs_to_hf.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename lib/marin/src/marin/{datakit/download => utilities}/upload_gcs_to_hf.py (100%) diff --git a/experiments/tootsie/exp1063_upload_tootsie.py b/experiments/tootsie/exp1063_upload_tootsie.py index d12aa5e060..c23d5de683 100644 --- a/experiments/tootsie/exp1063_upload_tootsie.py +++ b/experiments/tootsie/exp1063_upload_tootsie.py @@ -25,7 +25,7 @@ from dataclasses import dataclass, field -from marin.datakit.download.upload_gcs_to_hf import UploadConfig, upload_gcs_to_hf +from marin.utilities.upload_gcs_to_hf import UploadConfig, upload_gcs_to_hf from marin.execution.executor import ExecutorStep, executor_main diff --git a/lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py b/lib/marin/src/marin/utilities/upload_gcs_to_hf.py similarity index 100% rename from lib/marin/src/marin/datakit/download/upload_gcs_to_hf.py rename to lib/marin/src/marin/utilities/upload_gcs_to_hf.py From 642b6990e4ff370e65e39f32832c4a7e259d4cd3 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:45:11 -0700 Subject: [PATCH 19/56] Convert ar5iv into a package with its JSON data file alongside MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moves ar5iv.py → ar5iv/__init__.py and places ar5iv-v04-2024.json in the same package directory. Removes the now-empty data/ directory. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/{ar5iv.py => ar5iv/__init__.py} | 0 .../marin/datakit/download/{data => ar5iv}/ar5iv-v04-2024.json | 0 .../src/marin/datakit/download/{data => }/stackexchange/README.md | 0 .../download/{data => }/stackexchange/stackexchange-urls.tsv | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename lib/marin/src/marin/datakit/download/{ar5iv.py => ar5iv/__init__.py} (100%) rename lib/marin/src/marin/datakit/download/{data => ar5iv}/ar5iv-v04-2024.json (100%) rename lib/marin/src/marin/datakit/download/{data => }/stackexchange/README.md (100%) rename lib/marin/src/marin/datakit/download/{data => }/stackexchange/stackexchange-urls.tsv (100%) diff --git a/lib/marin/src/marin/datakit/download/ar5iv.py b/lib/marin/src/marin/datakit/download/ar5iv/__init__.py similarity index 100% rename from lib/marin/src/marin/datakit/download/ar5iv.py rename to lib/marin/src/marin/datakit/download/ar5iv/__init__.py diff --git a/lib/marin/src/marin/datakit/download/data/ar5iv-v04-2024.json b/lib/marin/src/marin/datakit/download/ar5iv/ar5iv-v04-2024.json similarity index 100% rename from lib/marin/src/marin/datakit/download/data/ar5iv-v04-2024.json rename to lib/marin/src/marin/datakit/download/ar5iv/ar5iv-v04-2024.json diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/README.md b/lib/marin/src/marin/datakit/download/stackexchange/README.md similarity index 100% rename from lib/marin/src/marin/datakit/download/data/stackexchange/README.md rename to lib/marin/src/marin/datakit/download/stackexchange/README.md diff --git a/lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv b/lib/marin/src/marin/datakit/download/stackexchange/stackexchange-urls.tsv similarity index 100% rename from lib/marin/src/marin/datakit/download/data/stackexchange/stackexchange-urls.tsv rename to 
lib/marin/src/marin/datakit/download/stackexchange/stackexchange-urls.tsv From 92420925f398704fd0425d0bc53375a0af61397f Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:45:31 -0700 Subject: [PATCH 20/56] Delete unused filesystem transfer module transfer_step and transfer_files have zero consumers in the codebase. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/filesystem.py | 81 ------------------- 1 file changed, 81 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download/filesystem.py diff --git a/lib/marin/src/marin/datakit/download/filesystem.py b/lib/marin/src/marin/datakit/download/filesystem.py deleted file mode 100644 index 7ace48ab38..0000000000 --- a/lib/marin/src/marin/datakit/download/filesystem.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -import logging -import os -import random -import time - -from iris.marin_fs import url_to_fs -from marin.execution.step_spec import StepSpec -from zephyr import Dataset, ZephyrContext - -from marin.utils import fsspec_exists, fsspec_glob - -logger = logging.getLogger(__name__) - - -def transfer_files( - input_path: str, - output_path: str, - *, - num_random_files: int | None = None, - filetype: str = "jsonl.zst", -) -> None: - """Transfer files from input_path to output_path. - - When num_random_files is None, copies all matching files. - When specified, randomly samples that many files. - """ - input_path = input_path.rstrip("/") - - logger.info("Transferring %s to %s", input_path, output_path) - start_time = time.time() - fs, _ = url_to_fs(input_path) - if not fs.exists(input_path): - raise FileNotFoundError(f"{input_path} does not exist.") - - filenames = fsspec_glob(os.path.join(input_path, f"**/*.{filetype}")) - - if num_random_files is not None: - random.seed(42) - random.shuffle(filenames) - filenames = filenames[:num_random_files] - - def copy_file(filename: str) -> None: - output_filename = os.path.join(output_path, os.path.basename(filename)) - if not fsspec_exists(output_filename): - fs.makedirs(output_path, exist_ok=True) - fs.copy(filename, output_filename) - - pipeline = Dataset.from_list(filenames).map(copy_file) - ctx = ZephyrContext(name="fs-transfer") - ctx.execute(pipeline) - - elapsed = time.time() - start_time - logger.info("Transferred %s to %s (%.1fs)", input_path, output_path, elapsed) - - -def transfer_step( - name: str, - *, - input_path: str, - num_random_files: int | None = None, - filetype: str = "jsonl.zst", - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that transfers files between fsspec paths.""" - - def _run(output_path: str) -> None: - transfer_files(input_path, output_path, num_random_files=num_random_files, filetype=filetype) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={"input_path": input_path, "num_random_files": num_random_files, "filetype": filetype}, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) From cd65de58d8d0dcf0ee56ecf16b6981a257fd9d95 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:47:21 -0700 Subject: [PATCH 21/56] Move ar5iv logic from __init__.py to download.py within the package Keeps __init__.py as a thin re-export layer, with the actual implementation in ar5iv/download.py alongside the JSON data file. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../marin/datakit/download/ar5iv/__init__.py | 161 +----------------- .../marin/datakit/download/ar5iv/download.py | 160 +++++++++++++++++ 2 files changed, 164 insertions(+), 157 deletions(-) create mode 100644 lib/marin/src/marin/datakit/download/ar5iv/download.py diff --git a/lib/marin/src/marin/datakit/download/ar5iv/__init__.py b/lib/marin/src/marin/datakit/download/ar5iv/__init__.py index 86498e12e1..5d820ef55f 100644 --- a/lib/marin/src/marin/datakit/download/ar5iv/__init__.py +++ b/lib/marin/src/marin/datakit/download/ar5iv/__init__.py @@ -1,160 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -""" -Download and process Ar5iv dataset from a zip file. - -Example Usage: -uv run zephyr --backend=ray --max-parallelism=1000 --memory=10GB \ - lib/marin/src/marin/download/ar5iv/download.py \ - --input_path gs://bucket/ar5iv.zip \ - --output_path gs://bucket/output -""" - -import json -import logging -import zipfile -from collections import defaultdict -from dataclasses import dataclass - -import draccus -from iris.marin_fs import open_url -from marin.execution.step_spec import StepSpec -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename -from iris.logging import configure_logging - -logger = logging.getLogger(__name__) - - -@dataclass -class Ar5ivDownloadConfig: - input_path: str - output_path: str - max_files: int | None = None # Maximum number of shards to process - - -def process_shard(shard_task: dict) -> dict: - """ - Process a single shard by extracting its files from the zip in GCS and uploading the merged JSONL. - - Args: - shard_task: Dict with keys 'input_path', 'output_path', 'shard_id', 'file_list' - """ - input_path = shard_task["input_path"] - output_path = shard_task["output_path"] - shard_id = shard_task["shard_id"] - file_list = shard_task["file_list"] - gcs_path = f"{output_path}/{shard_id}.jsonl.gz" - - with open_url(str(input_path), "rb") as f: - with zipfile.ZipFile(f) as zf: - with atomic_rename(gcs_path) as temp_path, open_url(temp_path, "wt", compression="gzip") as out_f: - for filename in file_list: - with zf.open(filename, "r") as file_handle: - content = file_handle.read() - record = { - "filename": filename, - "format": "html", - "content": content.decode("utf-8", errors="replace"), - } - print(json.dumps(record), file=out_f) - - logger.info(f"Shard {shard_id} with {len(file_list)} files uploaded to {gcs_path}") - return {"shard_id": shard_id, "num_files": len(file_list), "output_path": gcs_path} - - -def download(cfg: Ar5ivDownloadConfig) -> None: - """ - Download and process Ar5iv dataset from a zip file in GCS. - - This function can be called by the executor framework or used standalone. - """ - logger.info("Starting transfer of Ar5iv dataset...") - logger.info(f"Source: {cfg.input_path}") - - # Use fsspec+zipfile to list all files - with open_url(str(cfg.input_path), "rb") as f: - with zipfile.ZipFile(f) as zf: - all_files = zf.infolist() - - # Group by shard directory - # We assume structure: something like: shard_id/.../file - # shard_id is derived from the second last component if files are nested. - # Adjust as needed if directory structure differs. - shard_dict = defaultdict(list) - for info in all_files: - if info.is_dir(): - continue - # E.g. 
path might look like: "003/something.html" - # Extract shard_id from the directory: - # Split by "/" and take the first part if we assume structure {shard_id}/file - parts = info.filename.strip("/").split("/") - if len(parts) < 2: - # File at root level - decide how to handle this case. - # If no directory structure is given, skip or treat differently. - continue - shard_id = parts[-2] # get the second-last directory as shard_id - shard_dict[shard_id].append(info.filename) - - # Apply max_files limit if provided - shard_ids = list(shard_dict.keys()) - if cfg.max_files is not None: - shard_ids = shard_ids[: cfg.max_files] - - logger.info(f"Found {len(shard_ids)} shards to process.") - - # Build task list for each shard - shard_tasks = [] - for shard_id in shard_ids: - shard_tasks.append( - { - "input_path": cfg.input_path, - "output_path": cfg.output_path, - "shard_id": shard_id, - "file_list": shard_dict[shard_id], - } - ) - - # Execute pipeline with zephyr - pipeline = ( - Dataset.from_list(shard_tasks) - .map(process_shard) - .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) - ) - ctx = ZephyrContext(name="download-ar5iv") - ctx.execute(pipeline) - - logger.info("Transfer completed successfully!") - - -def ar5iv_step( - name: str = "raw/ar5iv", - *, - input_path: str, - max_files: int | None = None, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that downloads and processes the Ar5iv dataset from a zip file.""" - - def _run(output_path: str) -> None: - download(Ar5ivDownloadConfig(input_path=input_path, output_path=output_path, max_files=max_files)) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={"input_path": input_path, "max_files": max_files}, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) - - -@draccus.wrap() -def main(cfg: Ar5ivDownloadConfig) -> None: - """CLI entrypoint for downloading and processing Ar5iv dataset.""" - - configure_logging(level=logging.INFO) - download(cfg) +from marin.datakit.download.ar5iv.download import Ar5ivDownloadConfig as Ar5ivDownloadConfig +from marin.datakit.download.ar5iv.download import ar5iv_step as ar5iv_step +from marin.datakit.download.ar5iv.download import download as download +from marin.datakit.download.ar5iv.download import process_shard as process_shard diff --git a/lib/marin/src/marin/datakit/download/ar5iv/download.py b/lib/marin/src/marin/datakit/download/ar5iv/download.py new file mode 100644 index 0000000000..86498e12e1 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/ar5iv/download.py @@ -0,0 +1,160 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Download and process Ar5iv dataset from a zip file. 
+ +Example Usage: +uv run zephyr --backend=ray --max-parallelism=1000 --memory=10GB \ + lib/marin/src/marin/download/ar5iv/download.py \ + --input_path gs://bucket/ar5iv.zip \ + --output_path gs://bucket/output +""" + +import json +import logging +import zipfile +from collections import defaultdict +from dataclasses import dataclass + +import draccus +from iris.marin_fs import open_url +from marin.execution.step_spec import StepSpec +from zephyr import Dataset, ZephyrContext +from zephyr.writers import atomic_rename +from iris.logging import configure_logging + +logger = logging.getLogger(__name__) + + +@dataclass +class Ar5ivDownloadConfig: + input_path: str + output_path: str + max_files: int | None = None # Maximum number of shards to process + + +def process_shard(shard_task: dict) -> dict: + """ + Process a single shard by extracting its files from the zip in GCS and uploading the merged JSONL. + + Args: + shard_task: Dict with keys 'input_path', 'output_path', 'shard_id', 'file_list' + """ + input_path = shard_task["input_path"] + output_path = shard_task["output_path"] + shard_id = shard_task["shard_id"] + file_list = shard_task["file_list"] + gcs_path = f"{output_path}/{shard_id}.jsonl.gz" + + with open_url(str(input_path), "rb") as f: + with zipfile.ZipFile(f) as zf: + with atomic_rename(gcs_path) as temp_path, open_url(temp_path, "wt", compression="gzip") as out_f: + for filename in file_list: + with zf.open(filename, "r") as file_handle: + content = file_handle.read() + record = { + "filename": filename, + "format": "html", + "content": content.decode("utf-8", errors="replace"), + } + print(json.dumps(record), file=out_f) + + logger.info(f"Shard {shard_id} with {len(file_list)} files uploaded to {gcs_path}") + return {"shard_id": shard_id, "num_files": len(file_list), "output_path": gcs_path} + + +def download(cfg: Ar5ivDownloadConfig) -> None: + """ + Download and process Ar5iv dataset from a zip file in GCS. + + This function can be called by the executor framework or used standalone. + """ + logger.info("Starting transfer of Ar5iv dataset...") + logger.info(f"Source: {cfg.input_path}") + + # Use fsspec+zipfile to list all files + with open_url(str(cfg.input_path), "rb") as f: + with zipfile.ZipFile(f) as zf: + all_files = zf.infolist() + + # Group by shard directory + # We assume structure: something like: shard_id/.../file + # shard_id is derived from the second last component if files are nested. + # Adjust as needed if directory structure differs. + shard_dict = defaultdict(list) + for info in all_files: + if info.is_dir(): + continue + # E.g. path might look like: "003/something.html" + # Extract shard_id from the directory: + # Split by "/" and take the first part if we assume structure {shard_id}/file + parts = info.filename.strip("/").split("/") + if len(parts) < 2: + # File at root level - decide how to handle this case. + # If no directory structure is given, skip or treat differently. 
+ continue + shard_id = parts[-2] # get the second-last directory as shard_id + shard_dict[shard_id].append(info.filename) + + # Apply max_files limit if provided + shard_ids = list(shard_dict.keys()) + if cfg.max_files is not None: + shard_ids = shard_ids[: cfg.max_files] + + logger.info(f"Found {len(shard_ids)} shards to process.") + + # Build task list for each shard + shard_tasks = [] + for shard_id in shard_ids: + shard_tasks.append( + { + "input_path": cfg.input_path, + "output_path": cfg.output_path, + "shard_id": shard_id, + "file_list": shard_dict[shard_id], + } + ) + + # Execute pipeline with zephyr + pipeline = ( + Dataset.from_list(shard_tasks) + .map(process_shard) + .write_jsonl(f"{cfg.output_path}/.metrics/part-{{shard:05d}}.jsonl", skip_existing=True) + ) + ctx = ZephyrContext(name="download-ar5iv") + ctx.execute(pipeline) + + logger.info("Transfer completed successfully!") + + +def ar5iv_step( + name: str = "raw/ar5iv", + *, + input_path: str, + max_files: int | None = None, + deps: list[StepSpec] | None = None, + output_path_prefix: str | None = None, + override_output_path: str | None = None, +) -> StepSpec: + """Create a StepSpec that downloads and processes the Ar5iv dataset from a zip file.""" + + def _run(output_path: str) -> None: + download(Ar5ivDownloadConfig(input_path=input_path, output_path=output_path, max_files=max_files)) + + return StepSpec( + name=name, + fn=_run, + deps=deps or [], + hash_attrs={"input_path": input_path, "max_files": max_files}, + output_path_prefix=output_path_prefix, + override_output_path=override_output_path, + ) + + +@draccus.wrap() +def main(cfg: Ar5ivDownloadConfig) -> None: + """CLI entrypoint for downloading and processing Ar5iv dataset.""" + + configure_logging(level=logging.INFO) + download(cfg) From 7601f45cf1d6340e2abaa859db2dd018e34e929c Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:49:59 -0700 Subject: [PATCH 22/56] Delete unused stream_remove_columns module and its test Zero production consumers. The test_prune_hf_dataset test only exercised the deleted module. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../datakit/download/stream_remove_columns.py | 100 ------------------ tests/download/test_huggingface.py | 49 --------- 2 files changed, 149 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download/stream_remove_columns.py diff --git a/lib/marin/src/marin/datakit/download/stream_remove_columns.py b/lib/marin/src/marin/datakit/download/stream_remove_columns.py deleted file mode 100644 index ba883ee944..0000000000 --- a/lib/marin/src/marin/datakit/download/stream_remove_columns.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Remove unnecessary columns while streaming data from huggingface.""" - -import logging -import os -from dataclasses import dataclass - -import pandas as pd -import pyarrow.parquet as pq -from huggingface_hub import HfFileSystem -from tqdm import tqdm -from zephyr import Dataset, ZephyrContext - -logger = logging.getLogger(__name__) - - -def prune_stream_and_save(input_file: str, output_file: str, keep_columns: list[str]): - """ - Prunes and saves a parquet file by removing un-specified columns. - - Reads the input parquet file in batches, removes columns not in keep_columns, - and writes the result to output_file. Processing in batches avoids memory issues. 
- - Args: - input_file (str): Path to input parquet file on HuggingFace - output_file (str): Path where pruned parquet file will be saved - keep_columns (list[str]): List of column names to retain - """ - parquet_file = pq.ParquetFile(HfFileSystem().open(input_file)) - - full_df_list = [] - for batch in tqdm(parquet_file.iter_batches(batch_size=10000), desc=f"Processing {input_file}"): - df = batch.to_pandas() - - drop_columns = [col for col in df.columns if col not in keep_columns] - df = df.drop(columns=drop_columns) - - full_df_list.append(df) - - full_df = pd.concat(full_df_list) - logger.info(f"Saving pruned dataset of shape {full_df.shape} to {output_file}") - full_df.to_parquet(output_file, index=False) - - -def get_file_tasks(hf_path: str, output_path: str, keep_columns: list[str]): - """ - Generate file processing tasks for a HuggingFace subset. - - Args: - hf_path (str): The HuggingFace dataset path to load - output_path (str): The output path to save the pruned dataset - keep_columns (list[str]): The columns to keep in the pruned dataset - - Yields: - Dict with input_file, output_file, and keep_columns for each parquet file - """ - logger.info(f"Loading dataset from {hf_path}") - parquet_list = HfFileSystem().glob(f"{hf_path}/*.parquet") - - for file in parquet_list: - output_file = os.path.join(output_path, os.path.basename(file)) - yield {"input_file": file, "output_file": output_file, "keep_columns": keep_columns} - - -@dataclass -class DatasetConfig: - hf_repo_id: str - hf_revision: str - hf_paths: list[str] - output_path: str - keep_columns: list[str] - - -def prune_hf_dataset(cfg: DatasetConfig): - logger.info(f"Starting dataset pruning for {cfg.hf_paths}") - - # Build list of subset paths to process - subset_tasks = [] - for path in cfg.hf_paths: - # HF Path form: hf://[][@]/ - hf_path = f"hf://datasets/{cfg.hf_repo_id}@{cfg.hf_revision}/{path}" - logger.info(f"Processing subset {hf_path}") - output_path = os.path.join(cfg.output_path, path) - subset_tasks.append({"hf_path": hf_path, "output_path": output_path}) - - # Build pipeline with nested parallelism: - # - Outer level: process subsets (MAX_CONCURRENT_WORKERS=1) - # - Inner level: process files within each subset - pipeline = ( - Dataset.from_list(subset_tasks) - .flat_map(lambda task: get_file_tasks(task["hf_path"], task["output_path"], cfg.keep_columns)) - .map(lambda task: prune_stream_and_save(task["input_file"], task["output_file"], cfg.keep_columns)) - ) - - logger.info("Executing pipeline") - ctx = ZephyrContext(name="hf-remove-columns") - ctx.execute(pipeline) - logger.info("Successfully processed all subsets") diff --git a/tests/download/test_huggingface.py b/tests/download/test_huggingface.py index 4d16eadf6b..f055cc94ca 100644 --- a/tests/download/test_huggingface.py +++ b/tests/download/test_huggingface.py @@ -7,7 +7,6 @@ import json from unittest.mock import MagicMock, Mock, patch -import pandas as pd import pytest from marin.datakit.download.huggingface import ( @@ -16,10 +15,6 @@ download_hf, stream_file_to_fsspec, ) -from marin.datakit.download.stream_remove_columns import ( - DatasetConfig, - prune_hf_dataset, -) @pytest.fixture @@ -155,50 +150,6 @@ def test_download_hf_bucket_requires_newer_huggingface_hub(tmp_path): download_hf(cfg) -def test_prune_hf_dataset(tmp_path): - """Test full dataset pruning pipeline.""" - # Create test parquet data - test_data = pd.DataFrame( - { - "id": [1, 2], - "text": ["hello", "world"], - "unwanted": ["a", "b"], - } - ) - - # Create multiple buffers since each call 
needs a fresh one - def create_buffer(): - buffer = io.BytesIO() - test_data.to_parquet(buffer, index=False) - buffer.seek(0) - return buffer - - cfg = DatasetConfig( - hf_repo_id="test-org/test-dataset", - hf_revision="main", - hf_paths=["data"], - output_path=str(tmp_path / "output"), - keep_columns=["id", "text"], - ) - - # Create output directory structure - output_dir = tmp_path / "output" / "data" - output_dir.mkdir(parents=True) - - mock_fs = MagicMock() - mock_fs.glob = Mock(return_value=["hf://datasets/test-org/test-dataset@main/data/file.parquet"]) - mock_fs.open = Mock(side_effect=lambda path, mode="rb": create_buffer()) - - with patch("marin.datakit.download.stream_remove_columns.HfFileSystem", return_value=mock_fs): - prune_hf_dataset(cfg) - - # Verify output - output_file = tmp_path / "output" / "data" / "file.parquet" - assert output_file.exists() - result_df = pd.read_parquet(output_file) - assert list(result_df.columns) == ["id", "text"] - - def test_stream_file_to_fsspec_retries_on_timeout(tmp_path): """A socket timeout while reading should trigger retry and then succeed.""" file_path = "datasets/test-org/test-dataset/data/file1.txt" From 64f5c48c365dcd37954b7b592e92a3d314f5c821 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 14:51:23 -0700 Subject: [PATCH 23/56] Remove unused dclm_hq_step function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No consumers — extract_dclm_hq_dump is called directly by transform_dclm_hq, not as a standalone download step. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/dclm_hq.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/dclm_hq.py b/lib/marin/src/marin/datakit/download/dclm_hq.py index a4301245aa..b473768ec0 100644 --- a/lib/marin/src/marin/datakit/download/dclm_hq.py +++ b/lib/marin/src/marin/datakit/download/dclm_hq.py @@ -23,7 +23,6 @@ import requests from iris.marin_fs import open_url -from marin.execution.step_spec import StepSpec import warcio from marin.utils import fsspec_glob from tqdm import tqdm @@ -193,26 +192,3 @@ def extract_dclm_hq_dump(input_path: str, output_path: str) -> None: ctx.execute(pipeline) logger.info("Processing completed successfully!") - - -def dclm_hq_step( - name: str = "raw/dclm-hq-html", - *, - input_path: str, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that downloads DCLM HQ HTML data from Common Crawl.""" - - def _run(output_path: str) -> None: - extract_dclm_hq_dump(input_path, output_path) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={"input_path": input_path}, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) From 76e3336775dfa26d40749f0596575bfca79906e7 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:08:32 -0700 Subject: [PATCH 24/56] Simplify nemotron_cc_step to download_nemotron_cc_step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove unnecessary parameters (deps, output_path_prefix, override_output_path) from the step function — this download takes no configuration. Rename for consistency. 
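Illustrative call-site sketch (the real consumer is
experiments/pretraining_datasets/nemotron.py; names as of this patch):

    from marin.datakit.download.nemotron_cc import download_nemotron_cc_step

    # The step only needs a name now; the step runner passes output_path to the
    # wrapped download function.
    step = download_nemotron_cc_step("raw/nemotron-cc")
    executor_step = step.as_executor_step()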
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/nemotron.py | 4 +-- .../src/marin/datakit/download/nemotron_cc.py | 26 +++---------------- 2 files changed, 5 insertions(+), 25 deletions(-) diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index 4c463d8e4f..7a5b50afb0 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -8,14 +8,14 @@ from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE from experiments.pretraining_datasets.dclm import dclm_components_llama3 -from marin.datakit.download.nemotron_cc import nemotron_cc_step +from marin.datakit.download.nemotron_cc import download_nemotron_cc_step from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep # Raw dataset download step downloads = { - "nemotron_cc": nemotron_cc_step("raw/nemotro-cc").as_executor_step(), + "nemotron_cc": download_nemotron_cc_step("raw/nemotro-cc").as_executor_step(), } _nemotron_cc_path = output_path_of(downloads["nemotron_cc"], "contrib/Nemotron/Nemotron-CC/data-jsonl/") diff --git a/lib/marin/src/marin/datakit/download/nemotron_cc.py b/lib/marin/src/marin/datakit/download/nemotron_cc.py index 0e65f307b9..8ba11e95b1 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_cc.py +++ b/lib/marin/src/marin/datakit/download/nemotron_cc.py @@ -1,14 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -""" -Download and process Nemotron-CC dataset from Common Crawl. - -Example Usage: -uv run zephyr --backend=ray --max-parallelism=100 --memory=4GB \ - lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py \ - --output_path gs://bucket/nemotron-output -""" +"""Download and process Nemotron-CC dataset from Common Crawl""" import json import logging @@ -115,23 +108,10 @@ def download_nemotron_cc(output_path: str) -> None: logger.info(f"Downloaded Nemotron CC files to {output_path}") -def nemotron_cc_step( - name: str = "raw/nemotron-cc", - *, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: +def download_nemotron_cc_step(name: str = "raw/nemotron-cc") -> StepSpec: """Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl.""" - def _run(output_path: str) -> None: - download_nemotron_cc(output_path) - return StepSpec( name=name, - fn=_run, - deps=deps or [], - hash_attrs={}, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, + fn=lambda output_path: download_nemotron_cc(output_path=output_path), ) From 90b9b65ef5555c3bf133c8aa613870e458aafc73 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:12:34 -0700 Subject: [PATCH 25/56] Rename nemotron_cc.py to nemotron_v1.py Prepares for adding nemotron_v2 download module alongside. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/nemotron.py | 2 +- .../datakit/download/{nemotron_cc.py => nemotron_v1.py} | 0 tests/download/test_nemotron_cc.py | 6 +++--- 3 files changed, 4 insertions(+), 4 deletions(-) rename lib/marin/src/marin/datakit/download/{nemotron_cc.py => nemotron_v1.py} (100%) diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index 7a5b50afb0..35d3d86e2b 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -8,7 +8,7 @@ from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE from experiments.pretraining_datasets.dclm import dclm_components_llama3 -from marin.datakit.download.nemotron_cc import download_nemotron_cc_step +from marin.datakit.download.nemotron_v1 import download_nemotron_cc_step from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/lib/marin/src/marin/datakit/download/nemotron_cc.py b/lib/marin/src/marin/datakit/download/nemotron_v1.py similarity index 100% rename from lib/marin/src/marin/datakit/download/nemotron_cc.py rename to lib/marin/src/marin/datakit/download/nemotron_v1.py diff --git a/tests/download/test_nemotron_cc.py b/tests/download/test_nemotron_cc.py index e4e89e361a..e8ed0e2de1 100644 --- a/tests/download/test_nemotron_cc.py +++ b/tests/download/test_nemotron_cc.py @@ -9,10 +9,10 @@ import pytest import zstandard as zstd from iris.marin_fs import open_url as _real_open_url -from marin.datakit.download.nemotron_cc import download_nemotron_cc +from marin.datakit.download.nemotron_v1 import download_nemotron_cc -_OPEN_URL_TARGET = "marin.datakit.download.nemotron_cc.open_url" -_REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_cc.requests.Session" +_OPEN_URL_TARGET = "marin.datakit.download.nemotron_v1.open_url" +_REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_v1.requests.Session" SAMPLE_NEMOTRON_RECORDS = [ { From 1760f2f089eb9221551113a848b7777f9242cd03 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:14:39 -0700 Subject: [PATCH 26/56] Extract nemotron_v2 download definitions into datakit/download/nemotron_v2.py Moves NEMOTRON_V2_DATASETS and nemotron_v2_download_step() from experiments/pretraining_datasets/nemotron_v2.py into a datakit module. Replaces the raw dict with a NemotronV2Dataset dataclass. The experiment file now imports definitions and only wires tokenization. 
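For example, the experiment wiring now reduces to (sketch mirroring the updated
experiment file in this patch):

    from marin.datakit.download.nemotron_v2 import NEMOTRON_V2_DATASETS, nemotron_v2_download_step

    # One download step per dataset family; tokenization reads subsets from the
    # dataclass via info.subsets.
    downloads = {
        family: nemotron_v2_download_step(family).as_executor_step()
        for family in NEMOTRON_V2_DATASETS
    }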
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/__init__.py | 2 +- .../pretraining_datasets/nemotron_v2.py | 124 ++---------------- .../src/marin/datakit/download/nemotron_v2.py | 122 +++++++++++++++++ 3 files changed, 131 insertions(+), 117 deletions(-) create mode 100644 lib/marin/src/marin/datakit/download/nemotron_v2.py diff --git a/experiments/pretraining_datasets/__init__.py b/experiments/pretraining_datasets/__init__.py index 090a298498..6ca2bff80f 100644 --- a/experiments/pretraining_datasets/__init__.py +++ b/experiments/pretraining_datasets/__init__.py @@ -130,7 +130,7 @@ # Nemotron v2 datasets (from nvidia/Nemotron-Pre-Training-Datasets collection) **{ family: { - "subsets": list(info["subsets"].keys()), + "subsets": list(info.subsets.keys()), "download": nemotron_v2_downloads[family], "tokenize_fn": lambda f=family: tokenize_nemotron_v2_family(f), } diff --git a/experiments/pretraining_datasets/nemotron_v2.py b/experiments/pretraining_datasets/nemotron_v2.py index ccb79f9e14..b3cd1d6760 100644 --- a/experiments/pretraining_datasets/nemotron_v2.py +++ b/experiments/pretraining_datasets/nemotron_v2.py @@ -2,134 +2,26 @@ # SPDX-License-Identifier: Apache-2.0 """ -Nemotron v2 pre-training dataset definitions and tokenization. +Nemotron v2 pre-training dataset tokenization. -These datasets come from the nvidia/Nemotron-Pre-Training-Datasets collection -on HuggingFace. They are additive to the original Nemotron-CC (v1) dataset -defined in nemotron.py. - -Most of these datasets are gated and require HF_TOKEN at download time. -All use parquet format with a "text" field. +Download definitions live in marin.datakit.download.nemotron_v2. +This file wires them into tokenization steps for experiment pipelines. """ import os.path -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.nemotron_v2 import NEMOTRON_V2_DATASETS, nemotron_v2_download_step from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep -# ============================================================================ -# DATASET DEFINITIONS -# ============================================================================ - -# Each entry: (hf_id, revision, subsets_dict) -# subsets_dict maps subset_name -> glob pattern for parquet files within the download - -NEMOTRON_V2_DATASETS = { - "nemotron_cc_v2": { - "hf_dataset_id": "nvidia/Nemotron-CC-v2", - "revision": "229a2e7", - "subsets": { - "diverse_qa": "Diverse-QA/**/*.parquet", - "high_quality": "High-Quality/**/*.parquet", - "high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet", - "medium_high_quality": "Medium-High-Quality/**/*.parquet", - "medium_quality": "Medium-Quality/**/*.parquet", - "translated_diverse_qa": "Translated-Diverse-QA/**/*.parquet", - }, - }, - "nemotron_cc_v2_1": { - "hf_dataset_id": "nvidia/Nemotron-CC-v2.1", - "revision": "ba6f2aa", - "subsets": { - "high_quality": "High-Quality/**/*.parquet", - "high_quality_dqa": "High-Quality-DQA/**/*.parquet", - "high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet", - "high_quality_translated": "High-Quality-Translated-To-English/**/*.parquet", - "high_quality_translated_synthetic": "High-Quality-Translated-To-English-Synthetic/**/*.parquet", - "medium_high_quality": "Medium-High-Quality/**/*.parquet", - "medium_high_quality_synthetic": 
"Medium-High-Quality-Synthetic/**/*.parquet", - "medium_high_quality_translated": "Medium-High-Quality-Translated-To-English/**/*.parquet", - "medium_quality": "Medium-Quality/**/*.parquet", - }, - }, - "nemotron_cc_code_v1": { - "hf_dataset_id": "nvidia/Nemotron-CC-Code-v1", - "revision": "5c5bebc", - "subsets": { - "all": "data/**/*.parquet", - }, - }, - "nemotron_cc_math_v1": { - "hf_dataset_id": "nvidia/Nemotron-CC-Math-v1", - "revision": "397a250", - "subsets": { - "3": "3/**/*.parquet", - "4plus": "4plus/**/*.parquet", - "4plus_mind": "4plus_MIND/**/*.parquet", - }, - }, - "nemotron_pretraining_code_v1": { - "hf_dataset_id": "nvidia/Nemotron-Pretraining-Code-v1", - "revision": "01393d3", - "subsets": { - "synthetic_code": "Synthetic-Code/**/*.parquet", - "code_metadata": "Nemotron-Code-Metadata/**/*.parquet", - }, - }, - "nemotron_pretraining_code_v2": { - "hf_dataset_id": "nvidia/Nemotron-Pretraining-Code-v2", - "revision": "7b1a453", - "subsets": { - "code_metadata": "Nemotron-Code-Metadata/**/*.parquet", - "synthetic_question_answering": "Synthetic-Question-Answering/**/*.parquet", - "synthetic_student_teacher": "Synthetic-Student-Teacher/**/*.parquet", - "synthetic_code_review": "Synthetic-Code-Review/**/*.parquet", - "synthetic_rewriting": "Synthetic-Rewriting/**/*.parquet", - "synthetic_transpilation": "Synthetic-Transpilation/**/*.parquet", - }, - }, - "nemotron_pretraining_specialized_v1": { - "hf_dataset_id": "nvidia/Nemotron-Pretraining-Specialized-v1", - "revision": "9ed3718", - "subsets": { - "wiki_rewrite": "Nemotron-Pretraining-Wiki-Rewrite/**/*.parquet", - "math_textbooks": "Nemotron-Pretraining-Math-Textbooks/**/*.parquet", - "stem_sft": "Nemotron-Pretraining-STEM-SFT/**/*.parquet", - "scientific_coding": "Nemotron-Pretraining-Scientific-Coding/**/*.parquet", - "rqa": "Nemotron-Pretraining-RQA/**/*.parquet", - "infinibyte_reasoning": "Nemotron-Pretraining-InfiniByte-Reasoning/**/*.parquet", - }, - }, - "nemotron_pretraining_sft_v1": { - "hf_dataset_id": "nvidia/Nemotron-Pretraining-SFT-v1", - "revision": "3f1a5b8", - "subsets": { - "sft_code": "Nemotron-SFT-Code/**/*.parquet", - "sft_general": "Nemotron-SFT-General/**/*.parquet", - "sft_math": "Nemotron-SFT-MATH/**/*.parquet", - }, - }, -} - - # ============================================================================ # RAW DATASET DOWNLOADS # ============================================================================ -downloads: dict[str, ExecutorStep] = {} -for _family, _info in NEMOTRON_V2_DATASETS.items(): - downloads[_family] = ExecutorStep( - name=f"raw/{_family}", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id=_info["hf_dataset_id"], - revision=versioned(_info["revision"]), - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - ) +downloads: dict[str, ExecutorStep] = { + family: nemotron_v2_download_step(family).as_executor_step() for family in NEMOTRON_V2_DATASETS +} # ============================================================================ @@ -152,7 +44,7 @@ def tokenize_nemotron_v2_family( download_step = downloads[family] steps: dict[str, ExecutorStep[TokenizeConfig]] = {} - for subset, glob_pattern in info["subsets"].items(): + for subset, glob_pattern in info.subsets.items(): output_name = os.path.join("tokenized", family, subset) step = ExecutorStep( name=output_name, diff --git a/lib/marin/src/marin/datakit/download/nemotron_v2.py b/lib/marin/src/marin/datakit/download/nemotron_v2.py new file mode 100644 index 0000000000..60b4f7902b --- /dev/null +++ 
b/lib/marin/src/marin/datakit/download/nemotron_v2.py @@ -0,0 +1,122 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Nemotron v2 pre-training dataset download definitions. + +These datasets come from the nvidia/Nemotron-Pre-Training-Datasets collection +on HuggingFace. They are additive to the original Nemotron-CC (v1) dataset. + +Most of these datasets are gated and require HF_TOKEN at download time. +All use parquet format with a "text" field. +""" + +from dataclasses import dataclass, field + +from marin.datakit.download.huggingface import download_hf_step +from marin.execution.step_spec import StepSpec + + +@dataclass(frozen=True) +class NemotronV2Dataset: + """Metadata for a single Nemotron v2 HuggingFace dataset.""" + + hf_dataset_id: str + revision: str + subsets: dict[str, str] = field(default_factory=dict) + """Maps subset_name -> glob pattern for parquet files within the download.""" + + +NEMOTRON_V2_DATASETS: dict[str, NemotronV2Dataset] = { + "nemotron_cc_v2": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-CC-v2", + revision="229a2e7", + subsets={ + "diverse_qa": "Diverse-QA/**/*.parquet", + "high_quality": "High-Quality/**/*.parquet", + "high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet", + "medium_high_quality": "Medium-High-Quality/**/*.parquet", + "medium_quality": "Medium-Quality/**/*.parquet", + "translated_diverse_qa": "Translated-Diverse-QA/**/*.parquet", + }, + ), + "nemotron_cc_v2_1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-CC-v2.1", + revision="ba6f2aa", + subsets={ + "high_quality": "High-Quality/**/*.parquet", + "high_quality_dqa": "High-Quality-DQA/**/*.parquet", + "high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet", + "high_quality_translated": "High-Quality-Translated-To-English/**/*.parquet", + "high_quality_translated_synthetic": "High-Quality-Translated-To-English-Synthetic/**/*.parquet", + "medium_high_quality": "Medium-High-Quality/**/*.parquet", + "medium_high_quality_synthetic": "Medium-High-Quality-Synthetic/**/*.parquet", + "medium_high_quality_translated": "Medium-High-Quality-Translated-To-English/**/*.parquet", + "medium_quality": "Medium-Quality/**/*.parquet", + }, + ), + "nemotron_cc_code_v1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-CC-Code-v1", + revision="5c5bebc", + subsets={"all": "data/**/*.parquet"}, + ), + "nemotron_cc_math_v1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-CC-Math-v1", + revision="397a250", + subsets={ + "3": "3/**/*.parquet", + "4plus": "4plus/**/*.parquet", + "4plus_mind": "4plus_MIND/**/*.parquet", + }, + ), + "nemotron_pretraining_code_v1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v1", + revision="01393d3", + subsets={ + "synthetic_code": "Synthetic-Code/**/*.parquet", + "code_metadata": "Nemotron-Code-Metadata/**/*.parquet", + }, + ), + "nemotron_pretraining_code_v2": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v2", + revision="7b1a453", + subsets={ + "code_metadata": "Nemotron-Code-Metadata/**/*.parquet", + "synthetic_question_answering": "Synthetic-Question-Answering/**/*.parquet", + "synthetic_student_teacher": "Synthetic-Student-Teacher/**/*.parquet", + "synthetic_code_review": "Synthetic-Code-Review/**/*.parquet", + "synthetic_rewriting": "Synthetic-Rewriting/**/*.parquet", + "synthetic_transpilation": "Synthetic-Transpilation/**/*.parquet", + }, + ), + "nemotron_pretraining_specialized_v1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-Pretraining-Specialized-v1", + 
revision="9ed3718", + subsets={ + "wiki_rewrite": "Nemotron-Pretraining-Wiki-Rewrite/**/*.parquet", + "math_textbooks": "Nemotron-Pretraining-Math-Textbooks/**/*.parquet", + "stem_sft": "Nemotron-Pretraining-STEM-SFT/**/*.parquet", + "scientific_coding": "Nemotron-Pretraining-Scientific-Coding/**/*.parquet", + "rqa": "Nemotron-Pretraining-RQA/**/*.parquet", + "infinibyte_reasoning": "Nemotron-Pretraining-InfiniByte-Reasoning/**/*.parquet", + }, + ), + "nemotron_pretraining_sft_v1": NemotronV2Dataset( + hf_dataset_id="nvidia/Nemotron-Pretraining-SFT-v1", + revision="3f1a5b8", + subsets={ + "sft_code": "Nemotron-SFT-Code/**/*.parquet", + "sft_general": "Nemotron-SFT-General/**/*.parquet", + "sft_math": "Nemotron-SFT-MATH/**/*.parquet", + }, + ), +} + + +def nemotron_v2_download_step(family: str) -> StepSpec: + """Create a download StepSpec for a Nemotron v2 dataset family.""" + info = NEMOTRON_V2_DATASETS[family] + return download_hf_step( + f"raw/{family}", + hf_dataset_id=info.hf_dataset_id, + revision=info.revision, + ) From d03a03a1096cf3872ba6b3a5ae59af6a03b27a04 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:18:22 -0700 Subject: [PATCH 27/56] Rename nemotron step functions for consistency download_nemotron_cc_step -> download_nemotron_v1_step nemotron_v2_download_step -> download_nemotron_v2_step Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/nemotron.py | 4 ++-- experiments/pretraining_datasets/nemotron_v2.py | 4 ++-- lib/marin/src/marin/datakit/download/nemotron_v1.py | 2 +- lib/marin/src/marin/datakit/download/nemotron_v2.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index 35d3d86e2b..22fc4d1efa 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -8,14 +8,14 @@ from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE from experiments.pretraining_datasets.dclm import dclm_components_llama3 -from marin.datakit.download.nemotron_v1 import download_nemotron_cc_step +from marin.datakit.download.nemotron_v1 import download_nemotron_v1_step from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep # Raw dataset download step downloads = { - "nemotron_cc": download_nemotron_cc_step("raw/nemotro-cc").as_executor_step(), + "nemotron_cc": download_nemotron_v1_step("raw/nemotro-cc").as_executor_step(), } _nemotron_cc_path = output_path_of(downloads["nemotron_cc"], "contrib/Nemotron/Nemotron-CC/data-jsonl/") diff --git a/experiments/pretraining_datasets/nemotron_v2.py b/experiments/pretraining_datasets/nemotron_v2.py index b3cd1d6760..980b5edc90 100644 --- a/experiments/pretraining_datasets/nemotron_v2.py +++ b/experiments/pretraining_datasets/nemotron_v2.py @@ -10,7 +10,7 @@ import os.path -from marin.datakit.download.nemotron_v2 import NEMOTRON_V2_DATASETS, nemotron_v2_download_step +from marin.datakit.download.nemotron_v2 import NEMOTRON_V2_DATASETS, download_nemotron_v2_step from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep @@ -20,7 +20,7 @@ # 
============================================================================ downloads: dict[str, ExecutorStep] = { - family: nemotron_v2_download_step(family).as_executor_step() for family in NEMOTRON_V2_DATASETS + family: download_nemotron_v2_step(family).as_executor_step() for family in NEMOTRON_V2_DATASETS } diff --git a/lib/marin/src/marin/datakit/download/nemotron_v1.py b/lib/marin/src/marin/datakit/download/nemotron_v1.py index 8ba11e95b1..0befbf1883 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v1.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v1.py @@ -108,7 +108,7 @@ def download_nemotron_cc(output_path: str) -> None: logger.info(f"Downloaded Nemotron CC files to {output_path}") -def download_nemotron_cc_step(name: str = "raw/nemotron-cc") -> StepSpec: +def download_nemotron_v1_step(name: str = "raw/nemotron-cc") -> StepSpec: """Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl.""" return StepSpec( diff --git a/lib/marin/src/marin/datakit/download/nemotron_v2.py b/lib/marin/src/marin/datakit/download/nemotron_v2.py index 60b4f7902b..4c31f81ffa 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v2.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v2.py @@ -112,7 +112,7 @@ class NemotronV2Dataset: } -def nemotron_v2_download_step(family: str) -> StepSpec: +def download_nemotron_v2_step(family: str) -> StepSpec: """Create a download StepSpec for a Nemotron v2 dataset family.""" info = NEMOTRON_V2_DATASETS[family] return download_hf_step( From 39fe0d16624882dc986cc7d01325047212126dca Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:19:30 -0700 Subject: [PATCH 28/56] Remove unnecessary __all__ from uncheatable_eval module Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/uncheatable_eval.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/uncheatable_eval.py b/lib/marin/src/marin/datakit/download/uncheatable_eval.py index 0bcdef3439..f009ba158c 100644 --- a/lib/marin/src/marin/datakit/download/uncheatable_eval.py +++ b/lib/marin/src/marin/datakit/download/uncheatable_eval.py @@ -427,12 +427,3 @@ def make_uncheatable_eval_step( github_token=github_token, skip_existing=skip_existing, ).as_executor_step() - - -__all__ = [ - "UncheatableEvalDataset", - "UncheatableEvalDownloadConfig", - "download_latest_uncheatable_eval", - "make_uncheatable_eval_step", - "uncheatable_eval_step", -] From c53407f5ff0ba04eaf2104bddb82d3299b6bdce4 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:20:46 -0700 Subject: [PATCH 29/56] Remove unused wikipedia_step function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No consumers — download() is called directly via ExecutorStep/CLI. 
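A hypothetical sketch of the direct ExecutorStep wiring (illustrative only; the
config fields match WikipediaDownloadConfig as of this patch, and the URL is the
simplewiki example from the module docstring):

    from marin.datakit.download.wikipedia import WikipediaDownloadConfig, download
    from marin.execution.executor import ExecutorStep, this_output_path

    simplewiki = (
        "https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/"
        "simplewiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz"
    )
    step = ExecutorStep(
        name="raw/wikipedia",
        fn=download,
        config=WikipediaDownloadConfig(
            input_urls=[simplewiki],
            revision="20250320",
            output_path=this_output_path(),
        ),
    )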
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/wikipedia.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index 1dce125a0f..cfd919fae6 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -35,7 +35,6 @@ import draccus import requests from iris.marin_fs import open_url -from marin.execution.step_spec import StepSpec from marin.utils import fsspec_size from tqdm_loggable.auto import tqdm from zephyr import Dataset, ZephyrContext, atomic_rename, load_jsonl @@ -124,27 +123,3 @@ def download(cfg: WikipediaDownloadConfig) -> None: ) logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) - - -def wikipedia_step( - name: str = "raw/wikipedia", - *, - input_urls: list[str], - revision: str, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that downloads and processes Wikipedia HTML dumps.""" - - def _run(output_path: str) -> None: - download(WikipediaDownloadConfig(input_urls=input_urls, revision=revision, output_path=output_path)) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={"input_urls": input_urls, "revision": revision}, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) From a85e541f3fa839876e3d46cd71d927ec060b87be Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:24:07 -0700 Subject: [PATCH 30/56] Flatten wikipedia download to plain parameters, remove draccus CLI download_wikipedia() now takes (input_urls, revision, output_path) directly. Removes WikipediaDownloadConfig, draccus decorator, and CLI entry point. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/wikipedia.py | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index cfd919fae6..ec51c62b0b 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -2,8 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 """ -wikipedia/download.py - Download script for the Wikipedia raw HTML data, provided by Wikimedia. 
Home Page: https://dumps.wikimedia.org/other/enterprise_html/runs/ @@ -11,14 +9,14 @@ Example Usage (production, large dataset): ENWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/enwiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz uv run zephyr --backend=ray --max-parallelism=10 \ - lib/marin/src/marin/download/wikipedia/download.py \ + lib/marin/src/marin/datakit/download/wikipedia.py \ --input_urls $ENWIKI \ --revision 20250320 --output_path gs://path/to/output Example Usage (local testing, small dataset): SIMPLEWIKI=https://dumps.wikimedia.org/other/enterprise_html/runs/20250320/simplewiki-NS0-20250320-ENTERPRISE-HTML.json.tar.gz -uv run zephyr --backend=threadpool --max-parallelism=4 --entry-point=download \ - lib/marin/src/marin/download/wikipedia/download.py \ +uv run zephyr --backend=threadpool --max-parallelism=4 --entry-point=main \ + lib/marin/src/marin/datakit/download/wikipedia.py \ --input_urls "[$SIMPLEWIKI]" \ --revision 20250320 --output_path /tmp/wikipedia_test @@ -30,9 +28,7 @@ import os import tarfile from collections.abc import Iterable -from dataclasses import dataclass -import draccus import requests from iris.marin_fs import open_url from marin.utils import fsspec_size @@ -42,14 +38,7 @@ logger = logging.getLogger(__name__) -@dataclass -class WikipediaDownloadConfig: - input_urls: list[str] - revision: str - output_path: str - - -def download_tar(url: str, output_prefix) -> str: +def download_tar(url: str, output_prefix: str) -> str: shard_filename = url.split("/")[-1] output_filename = os.path.join(output_prefix, shard_filename) logger.info(f"Downloading URL: {url} to {output_filename}") @@ -100,15 +89,14 @@ def process_file(input_file: str, output_path: str) -> Iterable[str]: raise e -@draccus.wrap() -def download(cfg: WikipediaDownloadConfig) -> None: +def download_wikipedia(input_urls: list[str], revision: str, output_path: str) -> None: """Download and process Wikipedia data.""" logger.info("Starting transfer of Wikipedia dump...") - output_base = os.path.join(cfg.output_path, cfg.revision) + output_base = os.path.join(output_path, revision) ctx = ZephyrContext(name="download-wikipedia") download_metrics = ctx.execute( - Dataset.from_list(cfg.input_urls) + Dataset.from_list(input_urls) .map(lambda url: download_tar(url, output_base)) .write_jsonl(f"{output_base}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), ) From e7ac5bec38516d1a0dbe906050a587e4d606dda8 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:25:56 -0700 Subject: [PATCH 31/56] Remove unused draccus CLI from huggingface download module Not invoked anywhere in the codebase. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/huggingface.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/huggingface.py b/lib/marin/src/marin/datakit/download/huggingface.py index 6a6ff13cd2..fff532a017 100644 --- a/lib/marin/src/marin/datakit/download/huggingface.py +++ b/lib/marin/src/marin/datakit/download/huggingface.py @@ -14,7 +14,6 @@ import time from dataclasses import dataclass, field -import draccus import huggingface_hub from huggingface_hub import HfFileSystem from iris.marin_fs import open_url, url_to_fs @@ -397,13 +396,3 @@ def _run(output_path: str) -> None: output_path_prefix=output_path_prefix, override_output_path=override_output_path, ) - - -@draccus.wrap() -def main(cfg: DownloadConfig) -> None: - """Download HuggingFace dataset.""" - download_hf(cfg) - - -if __name__ == "__main__": - main() From 0c7587aa8c9ee40566f6975d5a93c42603b5f9a2 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:28:04 -0700 Subject: [PATCH 32/56] Remove backward-compat aliases from datakit/download/__init__.py Clean up the download_step alias and __all__. The one consumer (test_datakit.py) now imports download_hf_step directly. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/__init__.py | 16 ---------------- tests/datakit/test_datakit.py | 4 ++-- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/__init__.py b/lib/marin/src/marin/datakit/download/__init__.py index cc14fdbdf4..ec8bc038b7 100644 --- a/lib/marin/src/marin/datakit/download/__init__.py +++ b/lib/marin/src/marin/datakit/download/__init__.py @@ -1,18 +1,2 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 - -from marin.datakit.download.huggingface import ( - DownloadConfig, - download_hf, - download_hf_step, -) - -# Backward-compat alias: download_step was the original name in the single-file module. -download_step = download_hf_step - -__all__ = [ - "DownloadConfig", - "download_hf", - "download_hf_step", - "download_step", -] diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py index 1c29e35a9c..0c126e6074 100644 --- a/tests/datakit/test_datakit.py +++ b/tests/datakit/test_datakit.py @@ -10,7 +10,7 @@ import pytest from levanter.store.cache import CacheLedger, TreeCache -from marin.datakit.download import download_step +from marin.datakit.download.huggingface import download_hf_step from marin.datakit.normalize import content_hash_id, normalize_step from marin.datakit.tokenize import tokenize_step from marin.execution.step_runner import StepRunner @@ -20,7 +20,7 @@ def test_download_normalize_tokenize(tmp_path): """Download → normalize → tokenize as a StepSpec DAG via StepRunner.""" - dl = download_step( + dl = download_hf_step( "datakit/download", hf_dataset_id="wikitext", revision="main", From aa2252d81b26a841567fdc8c86c7e7fdb525c1a0 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:29:44 -0700 Subject: [PATCH 33/56] Remove output_path_prefix from download_hf_step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify the API — override_output_path with relative path support (auto-prefixed by marin_prefix) is sufficient. 
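For illustration, the intended call shape after this change (a hedged sketch: the gs:// bucket and the "raw/wikitext" value are made-up examples, and the marin_prefix resolution is assumed from the description above rather than shown here):

    from marin.datakit.download.huggingface import download_hf_step

    # absolute path: used verbatim as the step's output path
    dl = download_hf_step(
        "datakit/download",
        hf_dataset_id="wikitext",
        revision="main",
        override_output_path="gs://my-bucket/raw/wikitext",  # hypothetical bucket
    )

    # relative path: assumed to be auto-prefixed by marin_prefix at run time
    dl = download_hf_step(
        "datakit/download",
        hf_dataset_id="wikitext",
        revision="main",
        override_output_path="raw/wikitext",
    )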
Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/huggingface.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/huggingface.py b/lib/marin/src/marin/datakit/download/huggingface.py index fff532a017..c414df96a9 100644 --- a/lib/marin/src/marin/datakit/download/huggingface.py +++ b/lib/marin/src/marin/datakit/download/huggingface.py @@ -351,7 +351,6 @@ def download_hf_step( hf_urls_glob: list[str] | None = None, zephyr_max_parallelism: int = 8, deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, override_output_path: str | None = None, ) -> StepSpec: """Create a StepSpec that downloads a HuggingFace dataset. @@ -365,7 +364,6 @@ def download_hf_step( hf_urls_glob: Glob patterns to select specific files. Empty means all files. zephyr_max_parallelism: Maximum download parallelism. deps: Optional upstream dependencies. - output_path_prefix: Override the default output path prefix. override_output_path: Override the computed output path entirely. Returns: @@ -393,6 +391,5 @@ def _run(output_path: str) -> None: "revision": revision, "hf_urls_glob": resolved_glob, }, - output_path_prefix=output_path_prefix, override_output_path=override_output_path, ) From 55565769a0fc7b69caa088e60f7bd0e58448a94a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:42:44 -0700 Subject: [PATCH 34/56] Remove unused datakit/tokenize.py module Only consumer was the integration test, which now uses StepSpec with TokenizeConfig directly. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/tokenize.py | 71 ------------------------- tests/datakit/test_datakit.py | 19 +++++-- 2 files changed, 14 insertions(+), 76 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/tokenize.py diff --git a/lib/marin/src/marin/datakit/tokenize.py b/lib/marin/src/marin/datakit/tokenize.py deleted file mode 100644 index 0e5c9b4168..0000000000 --- a/lib/marin/src/marin/datakit/tokenize.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Datakit tokenize stage — convert normalized Parquet datasets into Levanter cache format. - -This is the final stage of the datakit pipeline. It reads normalized Parquet -files and produces tokenized training data in Levanter's TreeStore format. - -Tokenization is the boundary where per-document structure ends. The tokenizer -concatenates documents into fixed-size token sequences for efficient training. -""" - -import logging - -from marin.execution.step_spec import StepSpec -from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize - -logger = logging.getLogger(__name__) - - -def tokenize_step( - name: str, - *, - input_path: str, - tokenizer: str, - max_workers: int = 4096, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec that tokenizes a normalized dataset. - - Reads normalized Parquet files and produces Levanter cache format output - suitable for training. - - Args: - name: Step name (e.g. "fineweb/tokenize"). - input_path: Path to normalized Parquet files (output of normalize step). - tokenizer: HuggingFace tokenizer name (e.g. "meta-llama/Llama-3.1-8B"). - max_workers: Maximum Zephyr worker parallelism. - deps: Upstream dependencies (typically the normalize or consolidate step). - output_path_prefix: Override the default output path prefix. 
- override_output_path: Override the computed output path entirely. - - Returns: - A StepSpec whose output_path contains the tokenized Levanter cache. - """ - - def _run(output_path: str) -> None: - tokenize( - TokenizeConfig( - train_paths=[input_path], - validation_paths=[], - cache_path=output_path, - tokenizer=tokenizer, - max_workers=max_workers, - allow_test_in_train=True, - ) - ) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={ - "input_path": input_path, - "tokenizer": tokenizer, - }, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py index 0c126e6074..184b0c6230 100644 --- a/tests/datakit/test_datakit.py +++ b/tests/datakit/test_datakit.py @@ -12,8 +12,9 @@ from marin.datakit.download.huggingface import download_hf_step from marin.datakit.normalize import content_hash_id, normalize_step -from marin.datakit.tokenize import tokenize_step from marin.execution.step_runner import StepRunner +from marin.execution.step_spec import StepSpec +from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize @pytest.mark.slow @@ -35,11 +36,19 @@ def test_download_normalize_tokenize(tmp_path): override_output_path=str(tmp_path / "normalized"), ) - tok = tokenize_step( - "datakit/tokenize", - input_path=norm.output_path, - tokenizer="gpt2", + tok = StepSpec( + name="datakit/tokenize", + fn=lambda output_path: tokenize( + TokenizeConfig( + train_paths=[norm.output_path], + validation_paths=[], + cache_path=output_path, + tokenizer="gpt2", + allow_test_in_train=True, + ) + ), deps=[norm], + hash_attrs={"tokenizer": "gpt2"}, override_output_path=str(tmp_path / "tokenized"), ) From f2983ba0114fae42199d23b9412d5af8e3c0d7b7 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:44:52 -0700 Subject: [PATCH 35/56] Remove unused datakit/normalize.py module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No production consumers. Simplify integration test to download → tokenize only. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/normalize.py | 194 ----------------------- tests/datakit/test_datakit.py | 36 +---- 2 files changed, 6 insertions(+), 224 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/normalize.py diff --git a/lib/marin/src/marin/datakit/normalize.py b/lib/marin/src/marin/datakit/normalize.py deleted file mode 100644 index bace847696..0000000000 --- a/lib/marin/src/marin/datakit/normalize.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Datakit normalize stage — convert raw data into the datakit standard Parquet format. - -The normalize step is the "intake" for the datakit pipeline. It reads raw files -(JSONL, Parquet, or other formats supported by Zephyr), enforces a standard -schema (mandatory ``id`` and ``text`` columns), and writes co-partitioned, -sorted Parquet files. - -Key guarantees after normalization: -- Every record has a deterministic ``id`` (SHA-256 of the text content). -- If the source data has an existing ID field, it is preserved as ``source_id``. -- Text is present and UTF-8 encoded. -- Each output partition is sorted by ``id``. -- Output files follow the ``part-{shard:05d}-of-{total:05d}.parquet`` naming convention. 
-""" - -import hashlib -import logging -import os -from collections.abc import Iterator - -from marin.execution.artifact import PathsMetadata -from marin.execution.step_spec import StepSpec -from marin.utils import fsspec_glob -from zephyr import Dataset, ShardInfo, ZephyrContext -from zephyr.readers import load_file - -logger = logging.getLogger(__name__) - -DEFAULT_TEXT_FIELD = "text" - - -def content_hash_id(text: str) -> str: - """Generate a deterministic document ID from text content. - - Uses SHA-256 truncated to 16 hex characters for a compact but - collision-resistant identifier. - """ - return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16] - - -def _discover_input_files(input_path: str) -> list[str]: - """Find all supported input files under input_path, excluding dotfiles/directories.""" - extensions = ["jsonl.gz", "jsonl.zst", "jsonl.zstd", "jsonl", "parquet", "vortex"] - files: list[str] = [] - for ext in extensions: - files.extend(fsspec_glob(os.path.join(input_path, f"**/*.{ext}"))) - # Exclude hidden directories (e.g. .metrics/ written by download_hf) - files = [f for f in files if "/." not in f.split(input_path, 1)[-1]] - if not files: - raise ValueError(f"No supported input files found under {input_path}") - return sorted(files) - - -def _normalize_record(record: dict, text_field: str, source_id_field: str | None) -> dict: - """Transform a single record into datakit standard format. - - - Extracts and renames the text field to ``text``. - - Generates a deterministic ``id`` from the text content. - - Preserves the original ID (if any) as ``source_id``. - - Preserves all other fields. - """ - text = record.get(text_field) - if text is None: - raise ValueError(f"Record missing required text field {text_field!r}: {list(record.keys())}") - if not isinstance(text, str): - text = str(text) - - doc_id = content_hash_id(text) - - normalized: dict = {"id": doc_id, "text": text} - - if source_id_field is not None and source_id_field in record: - normalized["source_id"] = str(record[source_id_field]) - - # Preserve additional columns - skip_fields = {text_field, source_id_field} if source_id_field else {text_field} - for key, value in record.items(): - if key not in skip_fields and key not in normalized: - normalized[key] = value - - return normalized - - -def normalize( - input_path: str, - output_path: str, - *, - text_field: str = DEFAULT_TEXT_FIELD, - source_id_field: str | None = None, - num_output_shards: int | None = None, - zephyr_max_workers: int = 64, -) -> PathsMetadata: - """Run the normalize pipeline. - - Reads raw files, transforms each record to the standard schema, - repartitions by ``id`` (hash-based), deduplicates, sorts each partition - by ``id``, and writes Parquet output files. - - Args: - input_path: Path to raw input files. - output_path: Directory to write output Parquet files. - text_field: Name of the field containing the primary text content. - source_id_field: Name of an existing ID field to preserve as ``source_id``. - num_output_shards: Number of output Parquet partitions. Defaults to - the number of input files. - zephyr_max_workers: Maximum Zephyr worker parallelism. - - Returns: - PathsMetadata listing the output files. 
- """ - input_files = _discover_input_files(input_path) - logger.info("Normalizing %d input files from %s", len(input_files), input_path) - - shards = num_output_shards or len(input_files) - - def _sort_shard(records: Iterator[dict], _shard_info: ShardInfo) -> Iterator[dict]: - batch = list(records) - batch.sort(key=lambda r: r["id"]) - return iter(batch) - - output_pattern = os.path.join(output_path, "part-{shard:05d}-of-{total:05d}.parquet") - pipeline = ( - Dataset.from_list(input_files) - .flat_map(load_file) - .map(lambda r: _normalize_record(r, text_field, source_id_field)) - .group_by( - key=lambda r: r["id"], - reducer=lambda _key, records: next(iter(records)), - num_output_shards=shards, - ) - .map_shard(_sort_shard) - .write_parquet(output_pattern) - ) - - ctx = ZephyrContext(name="datakit-normalize", max_workers=min(zephyr_max_workers, shards)) - output_files = list(ctx.execute(pipeline)) - logger.info("Wrote %d normalized Parquet partitions to %s", len(output_files), output_path) - return PathsMetadata(parent_path=output_path, paths=output_files) - - -def normalize_step( - name: str, - *, - input_path: str, - text_field: str = DEFAULT_TEXT_FIELD, - source_id_field: str | None = None, - num_output_shards: int | None = None, - zephyr_max_workers: int = 64, - deps: list[StepSpec] | None = None, - output_path_prefix: str | None = None, - override_output_path: str | None = None, -) -> StepSpec: - """Create a StepSpec for the normalize stage. - - Args: - name: Step name (e.g. "fineweb/normalize"). - input_path: Path to raw input files. - text_field: Name of the field containing the primary text content. - source_id_field: Name of an existing ID field to preserve as ``source_id``. - num_output_shards: Number of output Parquet partitions. - zephyr_max_workers: Maximum Zephyr worker parallelism. - deps: Upstream dependencies (typically the download step). - output_path_prefix: Override the default output path prefix. - override_output_path: Override the computed output path entirely. - - Returns: - A StepSpec whose output_path contains normalized Parquet files. 
- """ - - def _run(step_output_path: str) -> PathsMetadata: - return normalize( - input_path, - step_output_path, - text_field=text_field, - source_id_field=source_id_field, - num_output_shards=num_output_shards, - zephyr_max_workers=zephyr_max_workers, - ) - - return StepSpec( - name=name, - fn=_run, - deps=deps or [], - hash_attrs={ - "input_path": input_path, - "text_field": text_field, - "source_id_field": source_id_field, - }, - output_path_prefix=output_path_prefix, - override_output_path=override_output_path, - ) diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py index 184b0c6230..8a9286763b 100644 --- a/tests/datakit/test_datakit.py +++ b/tests/datakit/test_datakit.py @@ -1,25 +1,23 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -"""Integration test for the datakit pipeline: download → normalize → tokenize, wired as StepSpecs.""" +"""Integration test for the datakit pipeline: download → tokenize, wired as StepSpecs.""" from pathlib import Path import numpy as np -import pyarrow.parquet as pq import pytest from levanter.store.cache import CacheLedger, TreeCache from marin.datakit.download.huggingface import download_hf_step -from marin.datakit.normalize import content_hash_id, normalize_step from marin.execution.step_runner import StepRunner from marin.execution.step_spec import StepSpec from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize @pytest.mark.slow -def test_download_normalize_tokenize(tmp_path): - """Download → normalize → tokenize as a StepSpec DAG via StepRunner.""" +def test_download_and_tokenize(tmp_path): + """Download → tokenize as a StepSpec DAG via StepRunner.""" dl = download_hf_step( "datakit/download", @@ -29,50 +27,28 @@ def test_download_normalize_tokenize(tmp_path): override_output_path=str(tmp_path / "raw"), ) - norm = normalize_step( - "datakit/normalize", - input_path=dl.output_path, - deps=[dl], - override_output_path=str(tmp_path / "normalized"), - ) - tok = StepSpec( name="datakit/tokenize", fn=lambda output_path: tokenize( TokenizeConfig( - train_paths=[norm.output_path], + train_paths=[dl.output_path], validation_paths=[], cache_path=output_path, tokenizer="gpt2", allow_test_in_train=True, ) ), - deps=[norm], + deps=[dl], hash_attrs={"tokenizer": "gpt2"}, override_output_path=str(tmp_path / "tokenized"), ) - StepRunner().run([dl, norm, tok]) + StepRunner().run([dl, tok]) # -- Verify download output -- raw_files = [f for f in Path(dl.output_path).rglob("*") if f.is_file() and not f.name.startswith(".")] assert len(raw_files) >= 1 - # -- Verify normalize output -- - parquet_files = sorted(Path(norm.output_path).glob("*.parquet")) - assert len(parquet_files) >= 1 - - all_records = [] - for pf in parquet_files: - records = pq.read_table(str(pf)).to_pylist() - all_records.extend(records) - ids = [r["id"] for r in records] - assert ids == sorted(ids), f"Partition {pf.name} not sorted by id" - - assert len(all_records) > 0 - for record in all_records: - assert record["id"] == content_hash_id(record["text"]) - # -- Verify tokenize output -- train_dir = Path(tok.output_path) / "train" ledger = CacheLedger.load(str(train_dir)) From 764e117823fce6c2f68584d852e339c6e1b5832b Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:50:18 -0700 Subject: [PATCH 36/56] Move tests/download/ to tests/datakit/download/ Mirrors the source code location at marin.datakit.download.*. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/datakit/{ => download}/__init__.py | 0 tests/{ => datakit}/download/conftest.py | 0 tests/{ => datakit}/download/test_ar5iv.py | 0 tests/{ => datakit}/download/test_dclm_hq.py | 0 .../download/test_huggingface.py | 0 .../download/test_nemotron_cc.py | 0 tests/datakit/test_datakit.py | 61 ------------------- 7 files changed, 61 deletions(-) rename tests/datakit/{ => download}/__init__.py (100%) rename tests/{ => datakit}/download/conftest.py (100%) rename tests/{ => datakit}/download/test_ar5iv.py (100%) rename tests/{ => datakit}/download/test_dclm_hq.py (100%) rename tests/{ => datakit}/download/test_huggingface.py (100%) rename tests/{ => datakit}/download/test_nemotron_cc.py (100%) delete mode 100644 tests/datakit/test_datakit.py diff --git a/tests/datakit/__init__.py b/tests/datakit/download/__init__.py similarity index 100% rename from tests/datakit/__init__.py rename to tests/datakit/download/__init__.py diff --git a/tests/download/conftest.py b/tests/datakit/download/conftest.py similarity index 100% rename from tests/download/conftest.py rename to tests/datakit/download/conftest.py diff --git a/tests/download/test_ar5iv.py b/tests/datakit/download/test_ar5iv.py similarity index 100% rename from tests/download/test_ar5iv.py rename to tests/datakit/download/test_ar5iv.py diff --git a/tests/download/test_dclm_hq.py b/tests/datakit/download/test_dclm_hq.py similarity index 100% rename from tests/download/test_dclm_hq.py rename to tests/datakit/download/test_dclm_hq.py diff --git a/tests/download/test_huggingface.py b/tests/datakit/download/test_huggingface.py similarity index 100% rename from tests/download/test_huggingface.py rename to tests/datakit/download/test_huggingface.py diff --git a/tests/download/test_nemotron_cc.py b/tests/datakit/download/test_nemotron_cc.py similarity index 100% rename from tests/download/test_nemotron_cc.py rename to tests/datakit/download/test_nemotron_cc.py diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py deleted file mode 100644 index 8a9286763b..0000000000 --- a/tests/datakit/test_datakit.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Integration test for the datakit pipeline: download → tokenize, wired as StepSpecs.""" - -from pathlib import Path - -import numpy as np -import pytest -from levanter.store.cache import CacheLedger, TreeCache - -from marin.datakit.download.huggingface import download_hf_step -from marin.execution.step_runner import StepRunner -from marin.execution.step_spec import StepSpec -from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize - - -@pytest.mark.slow -def test_download_and_tokenize(tmp_path): - """Download → tokenize as a StepSpec DAG via StepRunner.""" - - dl = download_hf_step( - "datakit/download", - hf_dataset_id="wikitext", - revision="main", - hf_urls_glob=["wikitext-2-v1/test-*.parquet"], - override_output_path=str(tmp_path / "raw"), - ) - - tok = StepSpec( - name="datakit/tokenize", - fn=lambda output_path: tokenize( - TokenizeConfig( - train_paths=[dl.output_path], - validation_paths=[], - cache_path=output_path, - tokenizer="gpt2", - allow_test_in_train=True, - ) - ), - deps=[dl], - hash_attrs={"tokenizer": "gpt2"}, - override_output_path=str(tmp_path / "tokenized"), - ) - - StepRunner().run([dl, tok]) - - # -- Verify download output -- - raw_files = [f for f in Path(dl.output_path).rglob("*") if f.is_file() and not f.name.startswith(".")] - assert 
len(raw_files) >= 1 - - # -- Verify tokenize output -- - train_dir = Path(tok.output_path) / "train" - ledger = CacheLedger.load(str(train_dir)) - assert ledger.is_finished - assert ledger.total_num_rows > 0 - - exemplar = {"input_ids": np.array([0], dtype=np.int32)} - cache = TreeCache.load(str(train_dir), exemplar=exemplar) - assert len(cache) == ledger.total_num_rows - assert len(cache[0]["input_ids"]) > 0 From 05d58e29104cbff8ea18ff7da4552d53b0d52ef0 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 15:50:59 -0700 Subject: [PATCH 37/56] Restore tests/datakit/__init__.py and test_datakit.py The previous commit accidentally removed these when moving the download tests. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/datakit/__init__.py | 2 ++ tests/datakit/test_datakit.py | 61 +++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 tests/datakit/__init__.py create mode 100644 tests/datakit/test_datakit.py diff --git a/tests/datakit/__init__.py b/tests/datakit/__init__.py new file mode 100644 index 0000000000..ec8bc038b7 --- /dev/null +++ b/tests/datakit/__init__.py @@ -0,0 +1,2 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py new file mode 100644 index 0000000000..8a9286763b --- /dev/null +++ b/tests/datakit/test_datakit.py @@ -0,0 +1,61 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Integration test for the datakit pipeline: download → tokenize, wired as StepSpecs.""" + +from pathlib import Path + +import numpy as np +import pytest +from levanter.store.cache import CacheLedger, TreeCache + +from marin.datakit.download.huggingface import download_hf_step +from marin.execution.step_runner import StepRunner +from marin.execution.step_spec import StepSpec +from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize + + +@pytest.mark.slow +def test_download_and_tokenize(tmp_path): + """Download → tokenize as a StepSpec DAG via StepRunner.""" + + dl = download_hf_step( + "datakit/download", + hf_dataset_id="wikitext", + revision="main", + hf_urls_glob=["wikitext-2-v1/test-*.parquet"], + override_output_path=str(tmp_path / "raw"), + ) + + tok = StepSpec( + name="datakit/tokenize", + fn=lambda output_path: tokenize( + TokenizeConfig( + train_paths=[dl.output_path], + validation_paths=[], + cache_path=output_path, + tokenizer="gpt2", + allow_test_in_train=True, + ) + ), + deps=[dl], + hash_attrs={"tokenizer": "gpt2"}, + override_output_path=str(tmp_path / "tokenized"), + ) + + StepRunner().run([dl, tok]) + + # -- Verify download output -- + raw_files = [f for f in Path(dl.output_path).rglob("*") if f.is_file() and not f.name.startswith(".")] + assert len(raw_files) >= 1 + + # -- Verify tokenize output -- + train_dir = Path(tok.output_path) / "train" + ledger = CacheLedger.load(str(train_dir)) + assert ledger.is_finished + assert ledger.total_num_rows > 0 + + exemplar = {"input_ids": np.array([0], dtype=np.int32)} + cache = TreeCache.load(str(train_dir), exemplar=exemplar) + assert len(cache) == ledger.total_num_rows + assert len(cache[0]["input_ids"]) > 0 From bed10156a0356c1b7c90bdd75ca1680795a2e758 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:07:26 -0700 Subject: [PATCH 38/56] Replace nemotron downloads dict with nemotron_cc_download variable The single-entry dict was unnecessary indirection. All consumers updated to reference the variable directly. 
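Sketch of the consumer-side change (imports copied from the diff below; surrounding code elided):

    # before
    from experiments.pretraining_datasets.nemotron import downloads as nemotron_downloads
    nemotron_cc = nemotron_downloads["nemotron_cc"]

    # after
    from experiments.pretraining_datasets.nemotron import nemotron_cc_download
    nemotron_cc = nemotron_cc_download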
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/__init__.py | 4 ++-- experiments/pretraining_datasets/nemotron.py | 7 ++----- experiments/train_test_overlap/train_test_total.py | 4 ++-- lib/marin/src/marin/datakit/download/nemotron_v1.py | 6 ++++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/experiments/pretraining_datasets/__init__.py b/experiments/pretraining_datasets/__init__.py index 6ca2bff80f..79d651252b 100644 --- a/experiments/pretraining_datasets/__init__.py +++ b/experiments/pretraining_datasets/__init__.py @@ -37,7 +37,7 @@ NEMOTRON_DATASETS, NEMOTRON_LLAMA3_OVERRIDES, NEMOTRON_WEIGHTS, - downloads as nemotron_downloads, + nemotron_cc_download, nemotron_mix, nemotron_mix_block_shuffle, tokenize_nemotron, @@ -119,7 +119,7 @@ }, "nemotron_cc": { "subsets": list(NEMOTRON_DATASETS.keys()), - "download": nemotron_downloads["nemotron_cc"], + "download": nemotron_cc_download, "tokenize_fn": tokenize_nemotron, }, "dolma": { diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index 22fc4d1efa..b4211bd5aa 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -13,12 +13,9 @@ from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep -# Raw dataset download step -downloads = { - "nemotron_cc": download_nemotron_v1_step("raw/nemotro-cc").as_executor_step(), -} +nemotron_cc_download = download_nemotron_v1_step().as_executor_step() -_nemotron_cc_path = output_path_of(downloads["nemotron_cc"], "contrib/Nemotron/Nemotron-CC/data-jsonl/") +_nemotron_cc_path = output_path_of(nemotron_cc_download, "contrib/Nemotron/Nemotron-CC/data-jsonl/") NEMOTRON_DATASETS = { "hq_actual": ["quality=high/kind=actual/**/*.jsonl.*"], diff --git a/experiments/train_test_overlap/train_test_total.py b/experiments/train_test_overlap/train_test_total.py index e08dbfb4f2..92387dd61d 100644 --- a/experiments/train_test_overlap/train_test_total.py +++ b/experiments/train_test_overlap/train_test_total.py @@ -37,7 +37,7 @@ from experiments.midtraining_datasets import finemath_3_plus from experiments.pretraining_datasets.simple import downloads from experiments.pretraining_datasets.dolmino import downloads as dolmino_downloads -from experiments.pretraining_datasets.nemotron import downloads as nemotron_downloads +from experiments.pretraining_datasets.nemotron import nemotron_cc_download from experiments.train_test_overlap.eval_datasets_overlap import EVAL_DATASET_STEPS logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") @@ -81,7 +81,7 @@ def run_train_test_overlap(config: DeconConfig) -> str: DatasetConfig(name="starcoder", path=downloads["starcoderdata"], text_field="content"), DatasetConfig(name="proofpile", path=downloads["proofpile_2"]), DatasetConfig(name="dolmino", path=dolmino_downloads["dolmino"]), - DatasetConfig(name="nemotron_cc", path=nemotron_downloads["nemotron_cc"]), + DatasetConfig(name="nemotron_cc", path=nemotron_cc_download), ] diff --git a/lib/marin/src/marin/datakit/download/nemotron_v1.py b/lib/marin/src/marin/datakit/download/nemotron_v1.py index 0befbf1883..27a267b38d 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v1.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v1.py @@ -108,10 +108,12 @@ def download_nemotron_cc(output_path: str) -> None: logger.info(f"Downloaded Nemotron CC files to 
{output_path}") -def download_nemotron_v1_step(name: str = "raw/nemotron-cc") -> StepSpec: +def download_nemotron_v1_step() -> StepSpec: """Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl.""" return StepSpec( - name=name, + name="raw/nemotron_v1", fn=lambda output_path: download_nemotron_cc(output_path=output_path), + # NOTE: use the existing output to avoid re-downloading. Yes this is mssing the `n`. + override_output_path="raw/nemotro-cc-eeb783", ) From debf1fe0e1821442d8fd103586c850a0a4c1e79c Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:09:46 -0700 Subject: [PATCH 39/56] Replace nemotron_cc_download global with a function Inline _nemotron_cc_path into its only caller. All consumers now call nemotron_cc_download() instead of referencing a global. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/__init__.py | 2 +- experiments/pretraining_datasets/nemotron.py | 9 +++++---- experiments/train_test_overlap/train_test_total.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/experiments/pretraining_datasets/__init__.py b/experiments/pretraining_datasets/__init__.py index 79d651252b..93e9ffddf0 100644 --- a/experiments/pretraining_datasets/__init__.py +++ b/experiments/pretraining_datasets/__init__.py @@ -119,7 +119,7 @@ }, "nemotron_cc": { "subsets": list(NEMOTRON_DATASETS.keys()), - "download": nemotron_cc_download, + "download": nemotron_cc_download(), "tokenize_fn": tokenize_nemotron, }, "dolma": { diff --git a/experiments/pretraining_datasets/nemotron.py b/experiments/pretraining_datasets/nemotron.py index b4211bd5aa..d822c4dd5b 100644 --- a/experiments/pretraining_datasets/nemotron.py +++ b/experiments/pretraining_datasets/nemotron.py @@ -13,9 +13,10 @@ from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize from marin.processing.tokenize.data_configs import TokenizerStep -nemotron_cc_download = download_nemotron_v1_step().as_executor_step() -_nemotron_cc_path = output_path_of(nemotron_cc_download, "contrib/Nemotron/Nemotron-CC/data-jsonl/") +def nemotron_cc_download() -> ExecutorStep: + return download_nemotron_v1_step().as_executor_step() + NEMOTRON_DATASETS = { "hq_actual": ["quality=high/kind=actual/**/*.jsonl.*"], @@ -52,8 +53,8 @@ def _get_nemotron_split_paths(split: str): """Helper to get file paths for a nemotron split.""" - patterns = NEMOTRON_DATASETS[split] - return [_nemotron_cc_path / pattern for pattern in patterns] + base = output_path_of(nemotron_cc_download(), "contrib/Nemotron/Nemotron-CC/data-jsonl/") + return [base / pattern for pattern in NEMOTRON_DATASETS[split]] def tokenize_nemotron( diff --git a/experiments/train_test_overlap/train_test_total.py b/experiments/train_test_overlap/train_test_total.py index 92387dd61d..af280c552b 100644 --- a/experiments/train_test_overlap/train_test_total.py +++ b/experiments/train_test_overlap/train_test_total.py @@ -81,7 +81,7 @@ def run_train_test_overlap(config: DeconConfig) -> str: DatasetConfig(name="starcoder", path=downloads["starcoderdata"], text_field="content"), DatasetConfig(name="proofpile", path=downloads["proofpile_2"]), DatasetConfig(name="dolmino", path=dolmino_downloads["dolmino"]), - DatasetConfig(name="nemotron_cc", path=nemotron_cc_download), + DatasetConfig(name="nemotron_cc", path=nemotron_cc_download()), ] From 6ba73d5bdd57c6801b2d95d2c392226960c5e90a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:10:29 -0700 Subject: [PATCH 40/56] Fix typo in 
nemotron_v1 comment Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/nemotron_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/marin/src/marin/datakit/download/nemotron_v1.py b/lib/marin/src/marin/datakit/download/nemotron_v1.py index 27a267b38d..3a4f9a0a98 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v1.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v1.py @@ -114,6 +114,6 @@ def download_nemotron_v1_step() -> StepSpec: return StepSpec( name="raw/nemotron_v1", fn=lambda output_path: download_nemotron_cc(output_path=output_path), - # NOTE: use the existing output to avoid re-downloading. Yes this is mssing the `n`. + # NOTE: use the existing output to avoid re-downloading. Yes this is missing the `n`. override_output_path="raw/nemotro-cc-eeb783", ) From 4b86e369fde6c8bf97ec533d857bf753667b2f61 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:17:01 -0700 Subject: [PATCH 41/56] Rename huggingface.py to huggingface_utils.py, update all imports Updates 18 files with import paths and mock targets to reflect the rename from huggingface to huggingface_utils. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/common_pile/tokenize_common_pile.py | 2 +- experiments/defaults.py | 2 +- experiments/eval_datasets.py | 2 +- experiments/midtraining_datasets.py | 2 +- experiments/models.py | 2 +- .../download_and_tokenize_fineweb2_hq.py | 2 +- experiments/paloma.py | 2 +- experiments/posttrain/preference_datasets.py | 2 +- experiments/pretraining_datasets/dolma.py | 2 +- experiments/pretraining_datasets/dolmino.py | 2 +- experiments/pretraining_datasets/simple.py | 2 +- .../train_test_overlap/eval_datasets_overlap.py | 2 +- experiments/two_stage/data.py | 2 +- .../download/{huggingface.py => huggingface_utils.py} | 0 lib/marin/src/marin/datakit/download/nemotron_v2.py | 8 +++++++- .../marin/processing/tokenize/download_pretokenized.py | 2 +- lib/marin/src/marin/speedrun/paloma_local_download.py | 4 ++-- tests/datakit/download/test_huggingface.py | 10 +++++----- tests/datakit/test_datakit.py | 2 +- tests/test_hfdataset_spec.py | 2 +- 20 files changed, 30 insertions(+), 24 deletions(-) rename lib/marin/src/marin/datakit/download/{huggingface.py => huggingface_utils.py} (100%) diff --git a/experiments/common_pile/tokenize_common_pile.py b/experiments/common_pile/tokenize_common_pile.py index faee07fc76..1ec5b2f86a 100644 --- a/experiments/common_pile/tokenize_common_pile.py +++ b/experiments/common_pile/tokenize_common_pile.py @@ -5,7 +5,7 @@ from experiments.defaults import default_tokenize from experiments.llama import llama3_tokenizer -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path from marin.processing.tokenize.data_configs import TokenizerStep, lm_mixture_data_config diff --git a/experiments/defaults.py b/experiments/defaults.py index 01e9583442..ef1e9ad892 100644 --- a/experiments/defaults.py +++ b/experiments/defaults.py @@ -46,7 +46,7 @@ from experiments.simple_sft_config import SimpleSFTConfig from experiments.simple_train_config import SimpleTrainConfig from levanter.utils.mesh import MeshConfig -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.evaluation.evaluation_config 
import EvalTaskConfig from marin.execution.executor import ( ExecutorStep, diff --git a/experiments/eval_datasets.py b/experiments/eval_datasets.py index f55df8b3fc..db6e8f8f54 100644 --- a/experiments/eval_datasets.py +++ b/experiments/eval_datasets.py @@ -3,7 +3,7 @@ import dataclasses -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/midtraining_datasets.py b/experiments/midtraining_datasets.py index 2706f8a4e9..b30e57dc67 100644 --- a/experiments/midtraining_datasets.py +++ b/experiments/midtraining_datasets.py @@ -4,7 +4,7 @@ from experiments.common_pile.tokenize_common_pile import stackv2_edu_filtered from experiments.defaults import default_download, default_tokenize from experiments.llama import llama3_tokenizer -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution import versioned from marin.execution.executor import ExecutorStep, this_output_path from marin.processing.tokenize import lm_mixture_data_config diff --git a/experiments/models.py b/experiments/models.py index 972ca4f753..1afb7bb907 100644 --- a/experiments/models.py +++ b/experiments/models.py @@ -18,7 +18,7 @@ from dataclasses import dataclass -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.utils import get_directory_friendly_name diff --git a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py index a3fd2ae82a..db4a6fbb4b 100644 --- a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py +++ b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py @@ -13,7 +13,7 @@ from experiments.llama import llama3_tokenizer from experiments.multilingual_fineweb2_hq.constants import FINEWEB2_DATASETS -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/paloma.py b/experiments/paloma.py index 24c1a536df..e354e31d54 100644 --- a/experiments/paloma.py +++ b/experiments/paloma.py @@ -9,7 +9,7 @@ import os.path -from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig as HfDownloadConfig, download_hf # cyclic dependency # from experiments.llama import llama3_tokenizer diff --git a/experiments/posttrain/preference_datasets.py b/experiments/posttrain/preference_datasets.py index 105722d2af..9ea785000c 100644 --- a/experiments/posttrain/preference_datasets.py +++ b/experiments/posttrain/preference_datasets.py @@ -22,7 +22,7 @@ from collections.abc import Sequence from 
dataclasses import dataclass, field -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ( ExecutorStep, executor_main, diff --git a/experiments/pretraining_datasets/dolma.py b/experiments/pretraining_datasets/dolma.py index 02b62df0aa..51604389c0 100644 --- a/experiments/pretraining_datasets/dolma.py +++ b/experiments/pretraining_datasets/dolma.py @@ -10,7 +10,7 @@ import os.path -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned, InputName from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/dolmino.py b/experiments/pretraining_datasets/dolmino.py index 25dab84f52..0d8eb18a60 100644 --- a/experiments/pretraining_datasets/dolmino.py +++ b/experiments/pretraining_datasets/dolmino.py @@ -5,7 +5,7 @@ import os.path -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py index 5fa9a5fa65..b4ab925bae 100644 --- a/experiments/pretraining_datasets/simple.py +++ b/experiments/pretraining_datasets/simple.py @@ -12,7 +12,7 @@ from levanter.data.text import TextLmDatasetFormat from levanter.store.cache import CacheOptions -from marin.datakit.download.huggingface import download_hf_step +from marin.datakit.download.huggingface_utils import download_hf_step from marin.execution.executor import ExecutorStep, InputName, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize diff --git a/experiments/train_test_overlap/eval_datasets_overlap.py b/experiments/train_test_overlap/eval_datasets_overlap.py index b7df8679aa..f547aa3170 100644 --- a/experiments/train_test_overlap/eval_datasets_overlap.py +++ b/experiments/train_test_overlap/eval_datasets_overlap.py @@ -1,7 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/two_stage/data.py b/experiments/two_stage/data.py index c78daf0ab1..3493638097 100644 --- a/experiments/two_stage/data.py +++ b/experiments/two_stage/data.py @@ -6,7 +6,7 @@ from experiments.midtraining_datasets import finemath_3_plus_tokenized from experiments.pretraining_datasets import tokenize_dolma from experiments.pretraining_datasets.simple import tokenized -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path 
dolma_components = tokenize_dolma() diff --git a/lib/marin/src/marin/datakit/download/huggingface.py b/lib/marin/src/marin/datakit/download/huggingface_utils.py similarity index 100% rename from lib/marin/src/marin/datakit/download/huggingface.py rename to lib/marin/src/marin/datakit/download/huggingface_utils.py diff --git a/lib/marin/src/marin/datakit/download/nemotron_v2.py b/lib/marin/src/marin/datakit/download/nemotron_v2.py index 4c31f81ffa..24f074a92a 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v2.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v2.py @@ -12,7 +12,7 @@ from dataclasses import dataclass, field -from marin.datakit.download.huggingface import download_hf_step +from marin.datakit.download.huggingface_utils import download_hf_step from marin.execution.step_spec import StepSpec @@ -24,6 +24,7 @@ class NemotronV2Dataset: revision: str subsets: dict[str, str] = field(default_factory=dict) """Maps subset_name -> glob pattern for parquet files within the download.""" + override_output_path: str | None = None NEMOTRON_V2_DATASETS: dict[str, NemotronV2Dataset] = { @@ -38,6 +39,7 @@ class NemotronV2Dataset: "medium_quality": "Medium-Quality/**/*.parquet", "translated_diverse_qa": "Translated-Diverse-QA/**/*.parquet", }, + override_output_path="raw/nemotron_cc_v2-674913", ), "nemotron_cc_v2_1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-CC-v2.1", @@ -53,11 +55,13 @@ class NemotronV2Dataset: "medium_high_quality_translated": "Medium-High-Quality-Translated-To-English/**/*.parquet", "medium_quality": "Medium-Quality/**/*.parquet", }, + override_output_path="raw/nemotron_cc_v2_1-a7afb6", ), "nemotron_cc_code_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-CC-Code-v1", revision="5c5bebc", subsets={"all": "data/**/*.parquet"}, + override_output_path="raw/nemotron_cc_code_v1-c55cd9", ), "nemotron_cc_math_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-CC-Math-v1", @@ -67,6 +71,7 @@ class NemotronV2Dataset: "4plus": "4plus/**/*.parquet", "4plus_mind": "4plus_MIND/**/*.parquet", }, + override_output_path="nemotron_cc_math_v1-322fe4", ), "nemotron_pretraining_code_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v1", @@ -75,6 +80,7 @@ class NemotronV2Dataset: "synthetic_code": "Synthetic-Code/**/*.parquet", "code_metadata": "Nemotron-Code-Metadata/**/*.parquet", }, + override_output_path="raw/nemotron_pretraining_code_v1-175b37", ), "nemotron_pretraining_code_v2": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v2", diff --git a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py index cab2433bec..f7a30d4c25 100644 --- a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py +++ b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py @@ -18,7 +18,7 @@ ) from levanter.store.cache import CacheOptions -from marin.datakit.download.huggingface import ( +from marin.datakit.download.huggingface_utils import ( DownloadConfig as HfDownloadConfig, download_hf as hf_download_logic, ) diff --git a/lib/marin/src/marin/speedrun/paloma_local_download.py b/lib/marin/src/marin/speedrun/paloma_local_download.py index e2ee68f766..dd0031481d 100644 --- a/lib/marin/src/marin/speedrun/paloma_local_download.py +++ b/lib/marin/src/marin/speedrun/paloma_local_download.py @@ -8,8 +8,8 @@ """ from experiments.paloma import paloma_tokenized -from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig -from 
marin.datakit.download.huggingface import download_hf +from marin.datakit.download.huggingface_utils import DownloadConfig as HfDownloadConfig +from marin.datakit.download.huggingface_utils import download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned llama3_tokenizer = "meta-llama/Meta-Llama-3.1-8B" diff --git a/tests/datakit/download/test_huggingface.py b/tests/datakit/download/test_huggingface.py index f055cc94ca..4626bd498f 100644 --- a/tests/datakit/download/test_huggingface.py +++ b/tests/datakit/download/test_huggingface.py @@ -9,7 +9,7 @@ import pytest -from marin.datakit.download.huggingface import ( +from marin.datakit.download.huggingface_utils import ( DownloadConfig, _relative_path_in_source, download_hf, @@ -76,7 +76,7 @@ def test_download_hf_basic(mock_hf_fs, tmp_path): ) # Mock HfFileSystem creation - with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs): download_hf(cfg) # Verify files were downloaded @@ -118,7 +118,7 @@ def test_download_hf_appends_sha_when_configured(mock_hf_fs, tmp_path): append_sha_to_path=True, ) - with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs): download_hf(cfg) target_output = base_output_path / revision @@ -180,8 +180,8 @@ def read(self, chunk_size): hf_fs.open.side_effect = lambda path, mode="rb", **_kwargs: FlakyReader() with ( - patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs), - patch("marin.datakit.download.huggingface.time.sleep", return_value=None), + patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs), + patch("marin.datakit.download.huggingface_utils.time.sleep", return_value=None), ): result = stream_file_to_fsspec( str(output_path), diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py index 8a9286763b..d2b3578020 100644 --- a/tests/datakit/test_datakit.py +++ b/tests/datakit/test_datakit.py @@ -9,7 +9,7 @@ import pytest from levanter.store.cache import CacheLedger, TreeCache -from marin.datakit.download.huggingface import download_hf_step +from marin.datakit.download.huggingface_utils import download_hf_step from marin.execution.step_runner import StepRunner from marin.execution.step_spec import StepSpec from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize diff --git a/tests/test_hfdataset_spec.py b/tests/test_hfdataset_spec.py index 14ad782471..ef6d2bd264 100644 --- a/tests/test_hfdataset_spec.py +++ b/tests/test_hfdataset_spec.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from experiments.defaults import default_download, default_tokenize -from marin.datakit.download.huggingface import DownloadConfig +from marin.datakit.download.huggingface_utils import DownloadConfig from marin.processing.tokenize import HfDatasetSpec from marin.processing.tokenize.tokenize import HfTokenizeConfig, TokenizeConfig From e2ac4dec41b2880b8a8d7b8f0f502e16a0cfbf8c Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:18:05 -0700 Subject: [PATCH 42/56] Add override_output_path to nemotron_v2 datasets Wire override_output_path through NemotronV2Dataset to download_nemotron_v2_step. Fix missing raw/ prefix on nemotron_cc_math_v1. Add overrides for code_v2, specialized_v1, and sft_v1 to pin existing output paths. 
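A minimal sketch of the resulting wiring inside download_nemotron_v2_step (names match the diff below; imports and surrounding code elided):

    info = NEMOTRON_V2_DATASETS[family]  # e.g. family = "nemotron_cc_v2"
    return download_hf_step(
        f"raw/{family}",
        hf_dataset_id=info.hf_dataset_id,
        revision=info.revision,
        # None for datasets without a pinned path; otherwise reuses the
        # existing download, e.g. "raw/nemotron_cc_v2-674913"
        override_output_path=info.override_output_path,
    )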
Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/nemotron_v2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/marin/src/marin/datakit/download/nemotron_v2.py b/lib/marin/src/marin/datakit/download/nemotron_v2.py index 24f074a92a..0e845bd2cd 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v2.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v2.py @@ -71,7 +71,7 @@ class NemotronV2Dataset: "4plus": "4plus/**/*.parquet", "4plus_mind": "4plus_MIND/**/*.parquet", }, - override_output_path="nemotron_cc_math_v1-322fe4", + override_output_path="raw/nemotron_cc_math_v1-322fe4", ), "nemotron_pretraining_code_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v1", @@ -93,6 +93,7 @@ class NemotronV2Dataset: "synthetic_rewriting": "Synthetic-Rewriting/**/*.parquet", "synthetic_transpilation": "Synthetic-Transpilation/**/*.parquet", }, + override_output_path="raw/nemotron_pretraining_code_v2-d15a24", ), "nemotron_pretraining_specialized_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-Pretraining-Specialized-v1", @@ -105,6 +106,7 @@ class NemotronV2Dataset: "rqa": "Nemotron-Pretraining-RQA/**/*.parquet", "infinibyte_reasoning": "Nemotron-Pretraining-InfiniByte-Reasoning/**/*.parquet", }, + override_output_path="raw/nemotron_pretraining_specialized_v1-a31fae", ), "nemotron_pretraining_sft_v1": NemotronV2Dataset( hf_dataset_id="nvidia/Nemotron-Pretraining-SFT-v1", @@ -114,6 +116,7 @@ class NemotronV2Dataset: "sft_general": "Nemotron-SFT-General/**/*.parquet", "sft_math": "Nemotron-SFT-MATH/**/*.parquet", }, + override_output_path="raw/nemotron_pretraining_sft_v1-10f77e", ), } @@ -125,4 +128,5 @@ def download_nemotron_v2_step(family: str) -> StepSpec: f"raw/{family}", hf_dataset_id=info.hf_dataset_id, revision=info.revision, + override_output_path=info.override_output_path, ) From 7c2ab1e9e6a73aa4b9198636c14f86508801600c Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:24:01 -0700 Subject: [PATCH 43/56] Revert huggingface_utils.py rename back to huggingface.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The _utils suffix was misleading — this is the core HF download module, not a utility helper. Reverts all 19 import paths. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/common_pile/tokenize_common_pile.py | 2 +- experiments/defaults.py | 2 +- experiments/eval_datasets.py | 2 +- experiments/midtraining_datasets.py | 2 +- experiments/models.py | 2 +- .../download_and_tokenize_fineweb2_hq.py | 2 +- experiments/paloma.py | 2 +- experiments/posttrain/preference_datasets.py | 2 +- experiments/pretraining_datasets/dolma.py | 2 +- experiments/pretraining_datasets/dolmino.py | 2 +- experiments/pretraining_datasets/simple.py | 2 +- .../train_test_overlap/eval_datasets_overlap.py | 2 +- experiments/two_stage/data.py | 2 +- .../download/{huggingface_utils.py => huggingface.py} | 0 lib/marin/src/marin/datakit/download/nemotron_v2.py | 3 ++- .../marin/processing/tokenize/download_pretokenized.py | 2 +- lib/marin/src/marin/speedrun/paloma_local_download.py | 4 ++-- tests/datakit/download/test_huggingface.py | 10 +++++----- tests/datakit/test_datakit.py | 2 +- tests/test_hfdataset_spec.py | 2 +- 20 files changed, 25 insertions(+), 24 deletions(-) rename lib/marin/src/marin/datakit/download/{huggingface_utils.py => huggingface.py} (100%) diff --git a/experiments/common_pile/tokenize_common_pile.py b/experiments/common_pile/tokenize_common_pile.py index 1ec5b2f86a..faee07fc76 100644 --- a/experiments/common_pile/tokenize_common_pile.py +++ b/experiments/common_pile/tokenize_common_pile.py @@ -5,7 +5,7 @@ from experiments.defaults import default_tokenize from experiments.llama import llama3_tokenizer -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path from marin.processing.tokenize.data_configs import TokenizerStep, lm_mixture_data_config diff --git a/experiments/defaults.py b/experiments/defaults.py index ef1e9ad892..01e9583442 100644 --- a/experiments/defaults.py +++ b/experiments/defaults.py @@ -46,7 +46,7 @@ from experiments.simple_sft_config import SimpleSFTConfig from experiments.simple_train_config import SimpleTrainConfig from levanter.utils.mesh import MeshConfig -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.evaluation.evaluation_config import EvalTaskConfig from marin.execution.executor import ( ExecutorStep, diff --git a/experiments/eval_datasets.py b/experiments/eval_datasets.py index db6e8f8f54..f55df8b3fc 100644 --- a/experiments/eval_datasets.py +++ b/experiments/eval_datasets.py @@ -3,7 +3,7 @@ import dataclasses -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/midtraining_datasets.py b/experiments/midtraining_datasets.py index b30e57dc67..2706f8a4e9 100644 --- a/experiments/midtraining_datasets.py +++ b/experiments/midtraining_datasets.py @@ -4,7 +4,7 @@ from experiments.common_pile.tokenize_common_pile import stackv2_edu_filtered from experiments.defaults import default_download, default_tokenize from experiments.llama import llama3_tokenizer -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from 
marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution import versioned from marin.execution.executor import ExecutorStep, this_output_path from marin.processing.tokenize import lm_mixture_data_config diff --git a/experiments/models.py b/experiments/models.py index 1afb7bb907..972ca4f753 100644 --- a/experiments/models.py +++ b/experiments/models.py @@ -18,7 +18,7 @@ from dataclasses import dataclass -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.utils import get_directory_friendly_name diff --git a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py index db4a6fbb4b..a3fd2ae82a 100644 --- a/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py +++ b/experiments/multilingual_fineweb2_hq/download_and_tokenize_fineweb2_hq.py @@ -13,7 +13,7 @@ from experiments.llama import llama3_tokenizer from experiments.multilingual_fineweb2_hq.constants import FINEWEB2_DATASETS -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, output_path_of, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/paloma.py b/experiments/paloma.py index e354e31d54..24c1a536df 100644 --- a/experiments/paloma.py +++ b/experiments/paloma.py @@ -9,7 +9,7 @@ import os.path -from marin.datakit.download.huggingface_utils import DownloadConfig as HfDownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig, download_hf # cyclic dependency # from experiments.llama import llama3_tokenizer diff --git a/experiments/posttrain/preference_datasets.py b/experiments/posttrain/preference_datasets.py index 9ea785000c..105722d2af 100644 --- a/experiments/posttrain/preference_datasets.py +++ b/experiments/posttrain/preference_datasets.py @@ -22,7 +22,7 @@ from collections.abc import Sequence from dataclasses import dataclass, field -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ( ExecutorStep, executor_main, diff --git a/experiments/pretraining_datasets/dolma.py b/experiments/pretraining_datasets/dolma.py index 51604389c0..02b62df0aa 100644 --- a/experiments/pretraining_datasets/dolma.py +++ b/experiments/pretraining_datasets/dolma.py @@ -10,7 +10,7 @@ import os.path -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned, InputName from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/dolmino.py b/experiments/pretraining_datasets/dolmino.py index 0d8eb18a60..25dab84f52 100644 --- a/experiments/pretraining_datasets/dolmino.py +++ b/experiments/pretraining_datasets/dolmino.py @@ -5,7 +5,7 @@ import os.path 
-from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py index b4ab925bae..5fa9a5fa65 100644 --- a/experiments/pretraining_datasets/simple.py +++ b/experiments/pretraining_datasets/simple.py @@ -12,7 +12,7 @@ from levanter.data.text import TextLmDatasetFormat from levanter.store.cache import CacheOptions -from marin.datakit.download.huggingface_utils import download_hf_step +from marin.datakit.download.huggingface import download_hf_step from marin.execution.executor import ExecutorStep, InputName, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize diff --git a/experiments/train_test_overlap/eval_datasets_overlap.py b/experiments/train_test_overlap/eval_datasets_overlap.py index f547aa3170..b7df8679aa 100644 --- a/experiments/train_test_overlap/eval_datasets_overlap.py +++ b/experiments/train_test_overlap/eval_datasets_overlap.py @@ -1,7 +1,7 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.transform.huggingface.dataset_to_eval import DatasetConversionConfig, OutputFormatOptions, hf_dataset_to_jsonl diff --git a/experiments/two_stage/data.py b/experiments/two_stage/data.py index 3493638097..c78daf0ab1 100644 --- a/experiments/two_stage/data.py +++ b/experiments/two_stage/data.py @@ -6,7 +6,7 @@ from experiments.midtraining_datasets import finemath_3_plus_tokenized from experiments.pretraining_datasets import tokenize_dolma from experiments.pretraining_datasets.simple import tokenized -from marin.datakit.download.huggingface_utils import DownloadConfig, download_hf +from marin.datakit.download.huggingface import DownloadConfig, download_hf from marin.execution.executor import ExecutorStep, this_output_path dolma_components = tokenize_dolma() diff --git a/lib/marin/src/marin/datakit/download/huggingface_utils.py b/lib/marin/src/marin/datakit/download/huggingface.py similarity index 100% rename from lib/marin/src/marin/datakit/download/huggingface_utils.py rename to lib/marin/src/marin/datakit/download/huggingface.py diff --git a/lib/marin/src/marin/datakit/download/nemotron_v2.py b/lib/marin/src/marin/datakit/download/nemotron_v2.py index 0e845bd2cd..91b644730b 100644 --- a/lib/marin/src/marin/datakit/download/nemotron_v2.py +++ b/lib/marin/src/marin/datakit/download/nemotron_v2.py @@ -12,7 +12,7 @@ from dataclasses import dataclass, field -from marin.datakit.download.huggingface_utils import download_hf_step +from marin.datakit.download.huggingface import download_hf_step from marin.execution.step_spec import StepSpec @@ -25,6 +25,7 @@ class NemotronV2Dataset: subsets: dict[str, str] = field(default_factory=dict) """Maps subset_name -> glob pattern for parquet files within the download.""" override_output_path: str | None = None + """Allow to point at existing download output to avoid re-downloading""" NEMOTRON_V2_DATASETS: dict[str, NemotronV2Dataset] = { diff --git 
a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py index f7a30d4c25..cab2433bec 100644 --- a/lib/marin/src/marin/processing/tokenize/download_pretokenized.py +++ b/lib/marin/src/marin/processing/tokenize/download_pretokenized.py @@ -18,7 +18,7 @@ ) from levanter.store.cache import CacheOptions -from marin.datakit.download.huggingface_utils import ( +from marin.datakit.download.huggingface import ( DownloadConfig as HfDownloadConfig, download_hf as hf_download_logic, ) diff --git a/lib/marin/src/marin/speedrun/paloma_local_download.py b/lib/marin/src/marin/speedrun/paloma_local_download.py index dd0031481d..e2ee68f766 100644 --- a/lib/marin/src/marin/speedrun/paloma_local_download.py +++ b/lib/marin/src/marin/speedrun/paloma_local_download.py @@ -8,8 +8,8 @@ """ from experiments.paloma import paloma_tokenized -from marin.datakit.download.huggingface_utils import DownloadConfig as HfDownloadConfig -from marin.datakit.download.huggingface_utils import download_hf +from marin.datakit.download.huggingface import DownloadConfig as HfDownloadConfig +from marin.datakit.download.huggingface import download_hf from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned llama3_tokenizer = "meta-llama/Meta-Llama-3.1-8B" diff --git a/tests/datakit/download/test_huggingface.py b/tests/datakit/download/test_huggingface.py index 4626bd498f..f055cc94ca 100644 --- a/tests/datakit/download/test_huggingface.py +++ b/tests/datakit/download/test_huggingface.py @@ -9,7 +9,7 @@ import pytest -from marin.datakit.download.huggingface_utils import ( +from marin.datakit.download.huggingface import ( DownloadConfig, _relative_path_in_source, download_hf, @@ -76,7 +76,7 @@ def test_download_hf_basic(mock_hf_fs, tmp_path): ) # Mock HfFileSystem creation - with patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): download_hf(cfg) # Verify files were downloaded @@ -118,7 +118,7 @@ def test_download_hf_appends_sha_when_configured(mock_hf_fs, tmp_path): append_sha_to_path=True, ) - with patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs): + with patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs): download_hf(cfg) target_output = base_output_path / revision @@ -180,8 +180,8 @@ def read(self, chunk_size): hf_fs.open.side_effect = lambda path, mode="rb", **_kwargs: FlakyReader() with ( - patch("marin.datakit.download.huggingface_utils.HfFileSystem", return_value=hf_fs), - patch("marin.datakit.download.huggingface_utils.time.sleep", return_value=None), + patch("marin.datakit.download.huggingface.HfFileSystem", return_value=hf_fs), + patch("marin.datakit.download.huggingface.time.sleep", return_value=None), ): result = stream_file_to_fsspec( str(output_path), diff --git a/tests/datakit/test_datakit.py b/tests/datakit/test_datakit.py index d2b3578020..8a9286763b 100644 --- a/tests/datakit/test_datakit.py +++ b/tests/datakit/test_datakit.py @@ -9,7 +9,7 @@ import pytest from levanter.store.cache import CacheLedger, TreeCache -from marin.datakit.download.huggingface_utils import download_hf_step +from marin.datakit.download.huggingface import download_hf_step from marin.execution.step_runner import StepRunner from marin.execution.step_spec import StepSpec from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize diff --git 
a/tests/test_hfdataset_spec.py b/tests/test_hfdataset_spec.py index ef6d2bd264..14ad782471 100644 --- a/tests/test_hfdataset_spec.py +++ b/tests/test_hfdataset_spec.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from experiments.defaults import default_download, default_tokenize -from marin.datakit.download.huggingface_utils import DownloadConfig +from marin.datakit.download.huggingface import DownloadConfig from marin.processing.tokenize import HfDatasetSpec from marin.processing.tokenize.tokenize import HfTokenizeConfig, TokenizeConfig From 63de1bf4e3fe8216abb7a44abb99c0492290395a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:35:04 -0700 Subject: [PATCH 44/56] Delete unused dclm_hq download and transform modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit datakit/download/dclm_hq.py (CC HTML fetcher) and transform/dolmino/transform_dclm_hq.py (HTML→text converter) have zero experiment consumers. Removes their test as well. The DCLM mixture config in experiments/pretraining_datasets/dclm.py is unrelated and kept. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/dclm_hq.py | 194 ----------------- .../transform/dolmino/transform_dclm_hq.py | 156 -------------- tests/datakit/download/test_dclm_hq.py | 196 ------------------ 3 files changed, 546 deletions(-) delete mode 100644 lib/marin/src/marin/datakit/download/dclm_hq.py delete mode 100644 lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py delete mode 100644 tests/datakit/download/test_dclm_hq.py diff --git a/lib/marin/src/marin/datakit/download/dclm_hq.py b/lib/marin/src/marin/datakit/download/dclm_hq.py deleted file mode 100644 index b473768ec0..0000000000 --- a/lib/marin/src/marin/datakit/download/dclm_hq.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -""" -Download DCLM HQ HTML data by fetching HTML content from Common Crawl. - -Processes DCLM HQ JSONL files and enriches them with HTML content fetched from Common Crawl -via a custom index server. Uses zephyr for parallel processing with flattened parallelism. - -Example Usage: -uv run zephyr --backend=ray --max-parallelism=800 --memory=2GB \ - lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py \ - --input_path gs://marin-us-central2/raw/dclm-baseline-1.0-parquet/global/ \ - --output_path gs://marin-data/processed/dclm-hq-html/ -""" - -import io -import json -import logging -import os -import re -from dataclasses import dataclass - -import requests -from iris.marin_fs import open_url -import warcio -from marin.utils import fsspec_glob -from tqdm import tqdm -from zephyr import Dataset, ZephyrContext -from zephyr.writers import ensure_parent_dir - -CC_IDX_HOST_URL = "http://34.72.201.218:8080" -logger = logging.getLogger(__name__) - - -@dataclass -class FileTask: - """Represents a single file processing task.""" - - input_file_path: str - output_file_path: str - - -def fetch_warc_from_cc(s3_warc_path: str, length: int, offset: int) -> str: - """ - Fetch a WARC record from Common Crawl S3 bucket using byte range requests we get - from the CC index via `find_html_in_cc`. 
- Args: - s3_warc_path: Path to WARC file in S3 bucket - length: Length of the record in bytes - offset: Byte offset of the record in the WARC file - Returns: - The WARC record content as a string - """ - # Convert string values to integers - offset = int(offset) - length = int(length) - - # Make range request to CommonCrawl - response = requests.get( - f"https://data.commoncrawl.org/{s3_warc_path}", headers={"Range": f"bytes={offset}-{offset + length - 1}"} - ) - response.raise_for_status() - - # Parse WARC record and extract HTML content - with io.BytesIO(response.content) as stream: - for record in warcio.ArchiveIterator(stream): - content = record.content_stream().read() - return content.decode(errors="ignore") - - raise ValueError(f"No WARC records found in response from {s3_warc_path}") - - -def find_html_in_cc(split_id: str, target_uri: str) -> str | None: - """ - We host our own index of the Common Crawl over GCP which we use in this function. - For each call we receive a list of chunks that contain the HTML content for the given target URI. - We then fetch each chunk and concatenate them together to form the complete HTML content. - Args: - split_id: The split ID of the Common Crawl - target_uri: The target URI to find the HTML content for - Returns: - The HTML content as a string - """ - resp = requests.get(f"{CC_IDX_HOST_URL}/{split_id}-index?url={target_uri}&output=json") - - resp.raise_for_status() - - chunks = [json.loads(chunk) for chunk in resp.text.split("\n") if chunk] - sorted_chunks = sorted(chunks, key=lambda x: x["offset"]) - - html_content = "" - - for chunk in sorted_chunks: - warc_path = chunk["filename"] - length = chunk["length"] - offset = chunk["offset"] - - warc_record = fetch_warc_from_cc(warc_path, length, offset) - - html_content += warc_record - - return html_content - - -def process_file(task: FileTask) -> None: - """Process a single DCLM file, fetching HTML from Common Crawl. - - Args: - task: FileTask containing input and output file paths - """ - logger.info(f"Starting processing of file {task.input_file_path}") - logger.info(f"Source: {task.input_file_path}") - logger.info(f"Destination: {task.output_file_path}") - try: - ensure_parent_dir(task.output_file_path) - with ( - open_url(task.input_file_path, compression="zstd") as source, - open_url(task.output_file_path, "wt", compression="gzip") as output, - ): - text_wrapper = io.TextIOWrapper(source, encoding="utf-8") - - for line in tqdm(text_wrapper, desc="Processing lines"): - row = json.loads(line.strip()) - - # We need to extract the split from where the record was for querying the index - # The only place we have this information is in the warcinfo key in DCLM HQ - # The format is: - # warc-type: WARC/1.1 - # ... - # isPartOf: CC-MAIN-2024-01 - # This however is a string and not a key-value pair, so we need to extract - # the split from it via regex pattern `isPartOf:\s*(CC-MAIN-\d{4}-\d{2})`. - # This pattern groups the value of the key `isPartOf` that is of the form - # `CC-MAIN-xxxx-xx` where `xxxx` is a year and `xx` is a month. 
- match = re.search(r"isPartOf:\s*(CC-MAIN-\d{4}-\d{2})", row["metadata"]["warcinfo"]) - if match is None: - logger.error(f"No split found for record ID: {row['metadata']['WARC-Record-ID']}") - continue - - is_part_of = match.group(1) - - try: - html_string = find_html_in_cc(is_part_of, row["metadata"]["WARC-Target-URI"]) - - if html_string is None: - logger.error(f"No HTML found for record ID: {row['metadata']['WARC-Record-ID']}") - continue - - if "text" in row: - row.pop("text") - - row["html"] = html_string - - print(json.dumps(row), file=output) - except Exception as e: - logger.exception(f"Error processing line: {e}") - continue - - logger.info("\nProcessing completed successfully!") - logger.info(f"File available at: {task.output_file_path}") - - except Exception as e: - logger.error(f"Error during processing: {e}") - raise - - -def extract_dclm_hq_dump(input_path: str, output_path: str) -> None: - """Process the DCLM HQ dump and enrich with HTML from Common Crawl.""" - logger.info(f"Starting processing of DCLM HQ dump in {input_path}") - - all_files = [] - paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(input_path, "*"))] - - logger.info(f"Found {len(paths)} shards to process") - - for path in paths: - shard_input = os.path.join(input_path, path) - shard_paths = fsspec_glob(os.path.join(shard_input, "*.json.zst")) - - for shard_path in shard_paths: - output_file_path = os.path.join(output_path, path, os.path.basename(shard_path)).replace( - ".json.zst", ".jsonl.gz" - ) - all_files.append(FileTask(input_file_path=shard_path, output_file_path=output_file_path)) - - logger.info(f"Found {len(all_files)} files to process") - - pipeline = Dataset.from_list(all_files).map(process_file) - - ctx = ZephyrContext(name="download-dclm-html") - ctx.execute(pipeline) - - logger.info("Processing completed successfully!") diff --git a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py b/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py deleted file mode 100644 index 42f04264bf..0000000000 --- a/lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -""" -marin/transform/dolmino/transform_dclm_hq.py - -Performs HTML->Text/MD conversion using the specified tools over a DCLM HQ dump save in DOLMA format. 
- -Example Usage (production, large dataset): -uv run zephyr --backend=ray --max-parallelism=200 --memory=2GB \ - lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py \ - --entry-point=process_dclm_hq_dump \ - --input_hf_path "hf://datasets/allenai/dolmino-mix-1124@main/data/dclm" \ - --output_path gs://bucket/processed/dclm-hq \ - --extract_method resiliparse \ - --extract_config.type resiliparse \ - --hf_repo_id "allenai/dolmino-mix-1124" \ - --hf_revision "main" \ - --hf_paths '["data/dclm"]' - -Example Usage (local testing, small dataset): -uv run zephyr --backend=threadpool --max-parallelism=2 --entry-point=process_dclm_hq_dump \ - lib/marin/src/marin/transform/dolmino/transform_dclm_hq.py \ - --input_hf_path "hf://datasets/allenai/dolmino-mix-1124@main/data/dclm" \ - --output_path /tmp/dclm_hq_test \ - --extract_method trafilatura \ - --extract_config.type trafilatura \ - --extract_config.favor_precision false \ - --extract_config.favor_recall true \ - --hf_repo_id "allenai/dolmino-mix-1124" \ - --hf_revision "main" \ - --hf_paths '["data/dclm"]' \ - --max_split 1 -""" - -import json -import logging -import os -from dataclasses import dataclass - -import draccus -from iris.marin_fs import open_url, url_to_fs -from marin.datakit.download.dclm_hq import find_html_in_cc -from huggingface_hub import HfFileSystem -from marin.schemas.web.convert import ExtractionConfig -from marin.web.convert import convert_page -from tqdm import tqdm -from zephyr import Dataset, ZephyrContext -from zephyr.writers import atomic_rename - -logger = logging.getLogger(__name__) - - -@dataclass -class DCLMHQExtractionConfig: - input_hf_path: str - output_path: str - extract_method: str - extract_config: ExtractionConfig - hf_repo_id: str - hf_revision: str - hf_paths: list[str] - max_split: int | None = None - - -def process_file( - input_file_path: str, - output_file_path: str, - extract_method: str, - extract_config: ExtractionConfig, -) -> None: - logger.info(f"Starting processing of file {input_file_path}") - logger.info(f"Source: {input_file_path}") - logger.info(f"Destination: {output_file_path}") - - with atomic_rename(output_file_path) as temp_path: - with ( - open_url(input_file_path, "rt", compression="zstd") as source, - open_url(temp_path, "wt", compression="gzip") as output, - ): - for line in tqdm(source, desc="Processing lines"): - row = json.loads(line) - - try: - html_string = find_html_in_cc(row["metadata"]["WARC-Record-ID"], row["metadata"]["WARC-Target-URI"]) - - if html_string is None: - logger.error(f"No HTML found for record ID: {row['metadata']['WARC-Record-ID']}") - continue - - content = convert_page(html_string, extract_method=extract_method, config=extract_config)["content"] - - if content is None: - continue - - out_dict = { - "id": row["id"], - "source": row["source"], - "metadata": row["metadata"], - "text": content, - } - - print(json.dumps(out_dict), file=output) # Without this line, the JSON file will be corrupted - except Exception as e: - logger.exception(f"Error processing line: {e}") - continue - - logger.info("\nProcessing completed successfully!") - logger.info(f"File available at: {output_file_path}") - - -@draccus.wrap() -def process_dclm_hq_dump(cfg: DCLMHQExtractionConfig) -> None: - logger.info(f"Starting processing of DCLM HQ dump in {cfg.input_hf_path}") - - # Glob all files across all shards upfront - all_files = [] - hf_fs = HfFileSystem() - paths = [i.split("/")[-1] for i in hf_fs.ls(cfg.input_hf_path, detail=False)] - paths = paths[: cfg.max_split] if 
cfg.max_split else paths - - logger.info(f"Found {len(paths)} shards to process") - - for path in paths: - input_path = os.path.join(cfg.input_hf_path, path) - shard_paths = [i.split("/")[-1] for i in hf_fs.glob(os.path.join(input_path, "*.json.zst"))] - - for shard_path in shard_paths: - input_file_path = os.path.join(input_path, shard_path) - output_file_path = os.path.join(cfg.output_path, path, shard_path).replace(".json.zst", ".jsonl.gz") - all_files.append( - { - "input": input_file_path, - "output": output_file_path, - "extract_method": cfg.extract_method, - "extract_config": cfg.extract_config, - } - ) - - logger.info(f"Total files to process: {len(all_files)}") - - pipeline = ( - Dataset.from_list(all_files) - .filter(lambda f: not url_to_fs(f["output"])[0].exists(f["output"])) - .map( - lambda f: process_file( - input_file_path=f["input"], - output_file_path=f["output"], - extract_method=f["extract_method"], - extract_config=f["extract_config"], - ) - ) - ) - - ctx = ZephyrContext(name="transform-dclm-hq") - ctx.execute(pipeline) diff --git a/tests/datakit/download/test_dclm_hq.py b/tests/datakit/download/test_dclm_hq.py deleted file mode 100644 index c83b5e03fe..0000000000 --- a/tests/datakit/download/test_dclm_hq.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright The Marin Authors -# SPDX-License-Identifier: Apache-2.0 - -"""Tests for DCLM HQ download script that fetches HTML from Common Crawl.""" - -import json -from unittest.mock import patch - -import zstandard as zstd -from marin.datakit.download.dclm_hq import extract_dclm_hq_dump - -SAMPLE_DCLM_RECORDS = [ - { - "id": "test-doc-001", - "source": "common-crawl", - "text": "This is the original text that should be removed.", - "metadata": { - "WARC-Record-ID": "", - "WARC-Target-URI": "http://example.com/test-page", - "warcinfo": ( - "warc-type: WARC/1.1\nWARC-Date: 2024-01-15T10:30:00Z\nisPartOf: CC-MAIN-2024-01\ndescription: Test WARC" - ), - }, - }, - { - "id": "test-doc-002", - "source": "common-crawl", - "text": "This is another original text.", - "metadata": { - "WARC-Record-ID": "", - "WARC-Target-URI": "http://example.com/another-page", - "warcinfo": ( - "warc-type: WARC/1.1\nWARC-Date: 2024-01-15T11:30:00Z\nisPartOf: CC-MAIN-2024-01\ndescription: Test WARC" - ), - }, - }, - { - "id": "test-doc-003", - "source": "common-crawl", - "text": "Third document text.", - "metadata": { - "WARC-Record-ID": "", - "WARC-Target-URI": "http://example.com/third-page", - "warcinfo": ( - "warc-type: WARC/1.1\nWARC-Date: 2024-02-10T09:00:00Z\nisPartOf: CC-MAIN-2024-10\ndescription: Test WARC" - ), - }, - }, -] - -SAMPLE_WARC_HTML = { - "http://example.com/test-page": ( - """ - -Test Page - -

-<h1>Test Article</h1>
-<p>This is test content from Common Crawl.</p>
- -""" - ), - "http://example.com/another-page": ( - """ - -Another Page - -

-<h1>Another Article</h1>
-<p>Different content here.</p>
- -""" - ), - "http://example.com/third-page": ( - """ - -Third Page - -

-<h1>Third Article</h1>
-<p>More content.</p>
- -""" - ), -} - - -def create_warc_bytes(html_content: str) -> bytes: - """Create minimal WARC record bytes for testing.""" - http_response = ( - "HTTP/1.1 200 OK\r\n" - "Content-Type: text/html\r\n" - f"Content-Length: {len(html_content.encode())}\r\n" - "\r\n" - f"{html_content}" - ) - - warc_header = ( - "WARC/1.0\r\n" - "WARC-Type: response\r\n" - "WARC-Record-ID: \r\n" - "WARC-Target-URI: http://example.com/test\r\n" - "Content-Type: application/http; msgtype=response\r\n" - f"Content-Length: {len(http_response.encode())}\r\n" - "\r\n" - ) - - full_warc = warc_header + http_response + "\r\n\r\n" - return full_warc.encode() - - -def create_zstd_compressed_jsonl(records: list[dict]) -> bytes: - """Create zstd compressed JSONL content.""" - jsonl_content = "\n".join(json.dumps(record) for record in records) + "\n" - jsonl_bytes = jsonl_content.encode("utf-8") - cctx = zstd.ZstdCompressor() - return cctx.compress(jsonl_bytes) - - -def test_extract_dclm_hq_pipeline(tmp_path, read_all_jsonl_gz): - """Test full DCLM HQ download pipeline with zephyr integration.""" - output_dir = tmp_path / "output" - output_dir.mkdir() - - # Create input files in nested structure - shard1_dir = tmp_path / "input" / "shard1" - shard2_dir = tmp_path / "input" / "shard2" - shard1_dir.mkdir(parents=True) - shard2_dir.mkdir(parents=True) - - file1_data = create_zstd_compressed_jsonl([SAMPLE_DCLM_RECORDS[0]]) - file2_data = create_zstd_compressed_jsonl(SAMPLE_DCLM_RECORDS[1:]) - - file1_path = shard1_dir / "file1.json.zst" - file2_path = shard2_dir / "file2.json.zst" - - file1_path.write_bytes(file1_data) - file2_path.write_bytes(file2_data) - - def mock_requests_get(url, **kwargs): - from unittest.mock import Mock - - # Mock CC index server responses - if "CC-MAIN-2024-01-index" in url: - response = Mock() - response.status_code = 200 - if "test-page" in url: - response.text = json.dumps({"filename": "test.warc.gz", "offset": "0", "length": "1000"}) - else: # another-page - response.text = json.dumps({"filename": "test2.warc.gz", "offset": "0", "length": "1000"}) - response.raise_for_status = Mock() - return response - elif "CC-MAIN-2024-10-index" in url: - response = Mock() - response.status_code = 200 - response.text = json.dumps({"filename": "test3.warc.gz", "offset": "0", "length": "1000"}) - response.raise_for_status = Mock() - return response - # Mock Common Crawl WARC fetches - elif "data.commoncrawl.org" in url: - response = Mock() - response.status_code = 200 - # Determine which HTML to return based on the WARC file - if "test.warc.gz" in url: - html_content = SAMPLE_WARC_HTML["http://example.com/test-page"] - elif "test2.warc.gz" in url: - html_content = SAMPLE_WARC_HTML["http://example.com/another-page"] - else: # test3.warc.gz - html_content = SAMPLE_WARC_HTML["http://example.com/third-page"] - response.content = create_warc_bytes(html_content) - response.raise_for_status = Mock() - return response - - raise ValueError(f"Unexpected URL: {url}") - - with patch("marin.datakit.download.dclm_hq.requests.get", side_effect=mock_requests_get): - extract_dclm_hq_dump(str(tmp_path / "input"), str(output_dir)) - - # Verify output files were created in nested structure - shard1_output = output_dir / "shard1" - shard2_output = output_dir / "shard2" - - assert shard1_output.exists() - assert shard2_output.exists() - - # Read all records - all_records = [] - all_records.extend(read_all_jsonl_gz(shard1_output, "*.jsonl.gz")) - all_records.extend(read_all_jsonl_gz(shard2_output, "*.jsonl.gz")) - - assert 
len(all_records) == 3 - - # Verify records have HTML and no text - for record in all_records: - assert "id" in record - assert "html" in record - assert "text" not in record - assert "metadata" in record - assert len(record["html"]) > 0 From 38f64741f8c0d7d837040a73b73db99856d7f3e1 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 16:57:41 -0700 Subject: [PATCH 45/56] Extract dolmino download into datakit/download/dolmino.py Moves the download definition and DOLMINO_DATASETS split metadata into a datakit module. The experiment file now imports from there and only handles tokenization wiring. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/dolmino.py | 40 +++---------------- .../src/marin/datakit/download/dolmino.py | 32 +++++++++++++++ 2 files changed, 37 insertions(+), 35 deletions(-) create mode 100644 lib/marin/src/marin/datakit/download/dolmino.py diff --git a/experiments/pretraining_datasets/dolmino.py b/experiments/pretraining_datasets/dolmino.py index 25dab84f52..8126ded701 100644 --- a/experiments/pretraining_datasets/dolmino.py +++ b/experiments/pretraining_datasets/dolmino.py @@ -5,46 +5,16 @@ import os.path -from marin.datakit.download.huggingface import DownloadConfig, download_hf +from marin.datakit.download.dolmino import DOLMINO_DATASETS, download_dolmino_step from marin.execution.executor import ExecutorStep, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep -# Raw dataset download step -downloads = { - "dolmino": ( - ExecutorStep( - name="raw/dolmino-mix-1124", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="allenai/dolmino-mix-1124", - revision="bb54cab", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - ) - .with_output_path("raw/dolmino-mix-1124-157960") - .cd("bb54cab") - ) -} +_dolmino_download = download_dolmino_step().as_executor_step() +_dolmino_base_dir = _dolmino_download.cd("bb54cab").cd("data") -_dolmino_base_dir = downloads["dolmino"].cd("data") - -# The following dataset splits define file patterns for each split. 
-DOLMINO_DATASETS = { - "dclm": ["**/*.json.zst"], - "flan": ["**/*.json.gz"], - "math/codesearchnet-owmfilter": ["**/*.jsonl.gz"], - "math/dolmino_math_synth": ["**/*.jsonl"], - "math/gsm8k": ["**/*.jsonl.zst"], - "math/mathcoder2-synthmath": ["**/*.jsonl"], - "math/metamath-owmfilter": ["**/*.jsonl.gz"], - "math/tinyGSM-MIND": ["**/*.jsonl.gz"], - "math/tulu_math": ["**/*.jsonl"], - "pes2o": ["**/*.json.gz"], - "stackexchange": ["**/*.json.gz"], - "wiki": ["**/*.json.gz"], -} +# Backward compat — some consumers import this +downloads = {"dolmino": _dolmino_download.cd("bb54cab")} # NB: we changed how hashes were computed for this corpus and we'd like to avoid recomputing them DOLMINO_LLAMA3_OVERRIDES = { diff --git a/lib/marin/src/marin/datakit/download/dolmino.py b/lib/marin/src/marin/datakit/download/dolmino.py new file mode 100644 index 0000000000..0e1b063cf2 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/dolmino.py @@ -0,0 +1,32 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Dolmino dataset download definition and split metadata.""" + +from marin.datakit.download.huggingface import download_hf_step +from marin.execution.step_spec import StepSpec + +DOLMINO_DATASETS = { + "dclm": ["**/*.json.zst"], + "flan": ["**/*.json.gz"], + "math/codesearchnet-owmfilter": ["**/*.jsonl.gz"], + "math/dolmino_math_synth": ["**/*.jsonl"], + "math/gsm8k": ["**/*.jsonl.zst"], + "math/mathcoder2-synthmath": ["**/*.jsonl"], + "math/metamath-owmfilter": ["**/*.jsonl.gz"], + "math/tinyGSM-MIND": ["**/*.jsonl.gz"], + "math/tulu_math": ["**/*.jsonl"], + "pes2o": ["**/*.json.gz"], + "stackexchange": ["**/*.json.gz"], + "wiki": ["**/*.json.gz"], +} + + +def download_dolmino_step() -> StepSpec: + """Download the dolmino-mix-1124 dataset from HuggingFace.""" + return download_hf_step( + "raw/dolmino-mix-1124", + hf_dataset_id="allenai/dolmino-mix-1124", + revision="bb54cab", + override_output_path="raw/dolmino-mix-1124-157960", + ) From 83a01f804f1102ea9237c188c1bdaaabc4585d0a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 17:20:01 -0700 Subject: [PATCH 46/56] Extract dolma download into datakit/download/dolma.py Moves download_dolma_step(), DOLMA_DATASETS, and DOLMA_OLMO_MIXTURE_WEIGHTS into a datakit module. The experiment file now imports from there and only handles tokenization wiring. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/__init__.py | 3 +- experiments/pretraining_datasets/dolma.py | 73 ++----------------- lib/marin/src/marin/datakit/download/dolma.py | 60 +++++++++++++++ 3 files changed, 67 insertions(+), 69 deletions(-) create mode 100644 lib/marin/src/marin/datakit/download/dolma.py diff --git a/experiments/pretraining_datasets/__init__.py b/experiments/pretraining_datasets/__init__.py index 93e9ffddf0..571c4483cc 100644 --- a/experiments/pretraining_datasets/__init__.py +++ b/experiments/pretraining_datasets/__init__.py @@ -19,12 +19,11 @@ # Import downloads and tokenized dicts from each module from experiments.pretraining_datasets.dolma import ( - DOLMA_DATASETS, DOLMA_LLAMA3_OVERRIDES, - DOLMA_OLMO_MIXTURE_WEIGHTS, downloads as dolma_downloads, tokenize_dolma, ) +from marin.datakit.download.dolma import DOLMA_DATASETS, DOLMA_OLMO_MIXTURE_WEIGHTS from experiments.pretraining_datasets.dolmino import ( DOLMINO_DATASETS, DOLMINO_LLAMA3_OVERRIDES, diff --git a/experiments/pretraining_datasets/dolma.py b/experiments/pretraining_datasets/dolma.py index 02b62df0aa..97d840d79d 100644 --- a/experiments/pretraining_datasets/dolma.py +++ b/experiments/pretraining_datasets/dolma.py @@ -1,83 +1,23 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 -""" -DOLMA 1.7 dataset definitions and tokenization. - -This module defines the raw DOLMA dataset download and tokenization -logic for all 15 splits. -""" +"""DOLMA 1.7 dataset definitions and tokenization.""" import os.path -from marin.datakit.download.huggingface import DownloadConfig, download_hf -from marin.execution.executor import ExecutorStep, this_output_path, versioned, InputName +from marin.datakit.download.dolma import DOLMA_DATASETS, download_dolma_step +from marin.execution.executor import ExecutorStep, InputName, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep -# Raw dataset download step -downloads = { - "dolma": ExecutorStep( - name="raw/dolma", - fn=download_hf, - config=DownloadConfig( - hf_dataset_id="allenai/dolma", - revision="7f48140", - gcs_output_path=this_output_path(), - wait_for_completion=True, - ), - override_output_path="raw/dolma", - ) -} +_dolma_download = download_dolma_step().as_executor_step() +# Backward compat — some consumers import this +downloads = {"dolma": _dolma_download} # For dolma 1.7, we hardcode the path since it was added before versioning _DOLMA_V1_7_PATH = InputName.hardcoded("raw/dolma/v1.7") - -# Sampling proportion comes from https://huggingface.co/datasets/allenai/dolma -DOLMA_OLMO_MIXTURE_WEIGHTS = { - "dolma/algebraic-stack": 12.6, # 12.6 * 1.0 - "dolma/arxiv": 28.0, # 28.0 * 1.0 - "dolma/gutenberg": 5.3, # 5.3 * 1.0 - "dolma/c4": 124.95, # 249.9 * 0.5 - "dolma/cc": 597.75, # 1,195.5 * 0.5 - "dolma/cc-news": 14.3, # 1.0 - "dolma/falcon": 456.4, # 1.0, refined web - "dolma/megawika": 4.6, # 1.0 - "dolma/open-web-math": 12.6, # 1.0 - "dolma/pes2o": 57.2, # 1.0 - "dolma/reddit": 79.9, # 1.0 - "dolma/stackexchange": 19.6, # 1.0 - "dolma/starcoder": 263.8, # 1.0 - "dolma/flan": 16.5, # 6.5 * 1.0 - "dolma/wiki": 7.4, # 3.7 * 2.0 -} - -DOLMA_DATASETS = { - "algebraic-stack": ["algebraic-stack-train-{0000..0015}.json.gz"], - "arxiv": ["arxiv-{0000..0099}.json.gz"], - "gutenberg": ["books-{0000..0002}.json.gz"], - "c4": ["c4-{0000..0170}.json.gz"], - "cc": [ - "cc_en_head-{0000..0274}.json.gz", - 
"cc_en_middle-{0000..0238}.json.gz", - "cc_en_middle-{0240..0379}.json.gz", - "cc_en_tail-{0000..0152}.json.gz", - "cc_en_tail-{0154..0444}.json.gz", - ], - "cc-news": ["cc_news_head-{0000..0004}.json.gz", "cc_news_middle-{0000..0002}.json.gz", "cc_news_tail-0000.json.gz"], - "falcon": ["falcon-{0000..0499}.json.gz"], - "megawika": ["megawika-{0000..0261}.json.gz"], - "open-web-math": ["open-web-math-train-{0000..0012}.json.gz"], - "pes2o": ["pes2o-{0000..0025}.json.gz"], - "reddit": ["reddit-{0000..0077}.json.gz"], - "stackexchange": ["stackexchange-{0000..0025}.json.gz"], - "starcoder": ["starcoder-{0000..0048}.json.gz"], - "flan": ["tulu_flan-{0000..0065}.json.gz"], - "wiki": ["wiki-{0000..0001}.json.gz"], -} - # NB: we changed how hashes were computed for this corpus and we'd like to avoid recomputing them DOLMA_LLAMA3_OVERRIDES = { "c4": "tokenized/dolma/c4-e0e5ec", @@ -118,7 +58,6 @@ def tokenize_dolma(*, tokenizer: str | None = None) -> dict[str, TokenizerStep]: ), ) - # Check if we need to use override path for llama3 if tokenizer == llama3_tokenizer and dataset in DOLMA_LLAMA3_OVERRIDES: step = step.with_output_path(DOLMA_LLAMA3_OVERRIDES[dataset]) dolma_steps[os.path.join("dolma", dataset)] = step diff --git a/lib/marin/src/marin/datakit/download/dolma.py b/lib/marin/src/marin/datakit/download/dolma.py new file mode 100644 index 0000000000..b6849d4354 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/dolma.py @@ -0,0 +1,60 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Dolma 1.7 dataset download definition and split metadata.""" + +from marin.datakit.download.huggingface import download_hf_step +from marin.execution.step_spec import StepSpec + +DOLMA_DATASETS = { + "algebraic-stack": ["algebraic-stack-train-{0000..0015}.json.gz"], + "arxiv": ["arxiv-{0000..0099}.json.gz"], + "gutenberg": ["books-{0000..0002}.json.gz"], + "c4": ["c4-{0000..0170}.json.gz"], + "cc": [ + "cc_en_head-{0000..0274}.json.gz", + "cc_en_middle-{0000..0238}.json.gz", + "cc_en_middle-{0240..0379}.json.gz", + "cc_en_tail-{0000..0152}.json.gz", + "cc_en_tail-{0154..0444}.json.gz", + ], + "cc-news": ["cc_news_head-{0000..0004}.json.gz", "cc_news_middle-{0000..0002}.json.gz", "cc_news_tail-0000.json.gz"], + "falcon": ["falcon-{0000..0499}.json.gz"], + "megawika": ["megawika-{0000..0261}.json.gz"], + "open-web-math": ["open-web-math-train-{0000..0012}.json.gz"], + "pes2o": ["pes2o-{0000..0025}.json.gz"], + "reddit": ["reddit-{0000..0077}.json.gz"], + "stackexchange": ["stackexchange-{0000..0025}.json.gz"], + "starcoder": ["starcoder-{0000..0048}.json.gz"], + "flan": ["tulu_flan-{0000..0065}.json.gz"], + "wiki": ["wiki-{0000..0001}.json.gz"], +} + +# Sampling proportion comes from https://huggingface.co/datasets/allenai/dolma +DOLMA_OLMO_MIXTURE_WEIGHTS = { + "dolma/algebraic-stack": 12.6, + "dolma/arxiv": 28.0, + "dolma/gutenberg": 5.3, + "dolma/c4": 124.95, + "dolma/cc": 597.75, + "dolma/cc-news": 14.3, + "dolma/falcon": 456.4, + "dolma/megawika": 4.6, + "dolma/open-web-math": 12.6, + "dolma/pes2o": 57.2, + "dolma/reddit": 79.9, + "dolma/stackexchange": 19.6, + "dolma/starcoder": 263.8, + "dolma/flan": 16.5, + "dolma/wiki": 7.4, +} + + +def download_dolma_step() -> StepSpec: + """Download the Dolma 1.7 dataset from HuggingFace.""" + return download_hf_step( + "raw/dolma", + hf_dataset_id="allenai/dolma", + revision="7f48140", + override_output_path="raw/dolma", + ) From 76b3ae86b50f18771907cd20b16e500c0cecd620 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 
2026 17:25:08 -0700 Subject: [PATCH 47/56] Move DOLMA_OLMO_MIXTURE_WEIGHTS back to experiment file Mixture weights are experiment config, not download metadata. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/pretraining_datasets/__init__.py | 3 ++- experiments/pretraining_datasets/dolma.py | 19 +++++++++++++++++++ lib/marin/src/marin/datakit/download/dolma.py | 19 ------------------- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/experiments/pretraining_datasets/__init__.py b/experiments/pretraining_datasets/__init__.py index 571c4483cc..2e0f6cc004 100644 --- a/experiments/pretraining_datasets/__init__.py +++ b/experiments/pretraining_datasets/__init__.py @@ -20,10 +20,11 @@ # Import downloads and tokenized dicts from each module from experiments.pretraining_datasets.dolma import ( DOLMA_LLAMA3_OVERRIDES, + DOLMA_OLMO_MIXTURE_WEIGHTS, downloads as dolma_downloads, tokenize_dolma, ) -from marin.datakit.download.dolma import DOLMA_DATASETS, DOLMA_OLMO_MIXTURE_WEIGHTS +from marin.datakit.download.dolma import DOLMA_DATASETS from experiments.pretraining_datasets.dolmino import ( DOLMINO_DATASETS, DOLMINO_LLAMA3_OVERRIDES, diff --git a/experiments/pretraining_datasets/dolma.py b/experiments/pretraining_datasets/dolma.py index 97d840d79d..256ea0b58e 100644 --- a/experiments/pretraining_datasets/dolma.py +++ b/experiments/pretraining_datasets/dolma.py @@ -15,6 +15,25 @@ # Backward compat — some consumers import this downloads = {"dolma": _dolma_download} +# Sampling proportion comes from https://huggingface.co/datasets/allenai/dolma +DOLMA_OLMO_MIXTURE_WEIGHTS = { + "dolma/algebraic-stack": 12.6, + "dolma/arxiv": 28.0, + "dolma/gutenberg": 5.3, + "dolma/c4": 124.95, + "dolma/cc": 597.75, + "dolma/cc-news": 14.3, + "dolma/falcon": 456.4, + "dolma/megawika": 4.6, + "dolma/open-web-math": 12.6, + "dolma/pes2o": 57.2, + "dolma/reddit": 79.9, + "dolma/stackexchange": 19.6, + "dolma/starcoder": 263.8, + "dolma/flan": 16.5, + "dolma/wiki": 7.4, +} + # For dolma 1.7, we hardcode the path since it was added before versioning _DOLMA_V1_7_PATH = InputName.hardcoded("raw/dolma/v1.7") diff --git a/lib/marin/src/marin/datakit/download/dolma.py b/lib/marin/src/marin/datakit/download/dolma.py index b6849d4354..7e9ac26e3b 100644 --- a/lib/marin/src/marin/datakit/download/dolma.py +++ b/lib/marin/src/marin/datakit/download/dolma.py @@ -30,25 +30,6 @@ "wiki": ["wiki-{0000..0001}.json.gz"], } -# Sampling proportion comes from https://huggingface.co/datasets/allenai/dolma -DOLMA_OLMO_MIXTURE_WEIGHTS = { - "dolma/algebraic-stack": 12.6, - "dolma/arxiv": 28.0, - "dolma/gutenberg": 5.3, - "dolma/c4": 124.95, - "dolma/cc": 597.75, - "dolma/cc-news": 14.3, - "dolma/falcon": 456.4, - "dolma/megawika": 4.6, - "dolma/open-web-math": 12.6, - "dolma/pes2o": 57.2, - "dolma/reddit": 79.9, - "dolma/stackexchange": 19.6, - "dolma/starcoder": 263.8, - "dolma/flan": 16.5, - "dolma/wiki": 7.4, -} - def download_dolma_step() -> StepSpec: """Download the Dolma 1.7 dataset from HuggingFace.""" From 213d250305f639fe33816d82e715277eba4b2427 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 17:31:15 -0700 Subject: [PATCH 48/56] Add download_wikipedia_step with override pointing at existing data Defaults to the enwiki 20241201 dump with override_output_path pointing at raw/wikipedia-a7dad0 where the data already lives. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/wikipedia.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index ec51c62b0b..1de08ba9fa 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -31,6 +31,7 @@ import requests from iris.marin_fs import open_url +from marin.execution.step_spec import StepSpec from marin.utils import fsspec_size from tqdm_loggable.auto import tqdm from zephyr import Dataset, ZephyrContext, atomic_rename, load_jsonl @@ -111,3 +112,31 @@ def download_wikipedia(input_urls: list[str], revision: str, output_path: str) - ) logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) + + +ENWIKI_20241201_URL = ( + "https://dumps.wikimedia.org/other/enterprise_html/runs/20241201/" "enwiki-NS0-20241201-ENTERPRISE-HTML.json.tar.gz" +) + + +def download_wikipedia_step( + *, + input_urls: list[str] | None = None, + revision: str = "20241201", +) -> StepSpec: + """Download Wikipedia HTML dumps. + + Defaults to the English Wikipedia 20241201 dump which is already + downloaded at ``raw/wikipedia-a7dad0``. + """ + urls = input_urls or [ENWIKI_20241201_URL] + + def _run(output_path: str) -> None: + download_wikipedia(urls, revision, output_path) + + return StepSpec( + name="raw/wikipedia", + fn=_run, + hash_attrs={"input_urls": urls, "revision": revision}, + override_output_path="raw/wikipedia-a7dad0", + ) From 80b8a237ca3282ccc23eb89102e047299c901328 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 18:42:47 -0700 Subject: [PATCH 49/56] Simplify download_wikipedia_step and remove revision param Drop the revision nesting from download_wikipedia. The step uses override_output_path to point at existing data when no input_urls are provided. 
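As a usage sketch of the two modes this step now supports (the fresh-download URL below is a made-up placeholder, not one referenced anywhere in this series):

    from marin.datakit.download.wikipedia import download_wikipedia_step

    # No inputs: reuse the existing dump. override_output_path points the step
    # at the previously downloaded data, so nothing is re-downloaded.
    reuse_existing = download_wikipedia_step()

    # Explicit inputs: perform a fresh download. With no override, the output
    # path is derived from the step name and hash_attrs, so a different URL
    # list lands in a new path.
    fresh = download_wikipedia_step(
        input_urls=[
            "https://dumps.wikimedia.org/other/enterprise_html/runs/20250601/"
            "enwiki-NS0-20250601-ENTERPRISE-HTML.json.tar.gz",  # placeholder URL
        ],
    )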
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/wikipedia.py | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index 1de08ba9fa..c2ccbdbe02 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -90,16 +90,15 @@ def process_file(input_file: str, output_path: str) -> Iterable[str]: raise e -def download_wikipedia(input_urls: list[str], revision: str, output_path: str) -> None: +def download_wikipedia(input_urls: list[str], output_path: str) -> None: """Download and process Wikipedia data.""" logger.info("Starting transfer of Wikipedia dump...") - output_base = os.path.join(output_path, revision) ctx = ZephyrContext(name="download-wikipedia") download_metrics = ctx.execute( Dataset.from_list(input_urls) - .map(lambda url: download_tar(url, output_base)) - .write_jsonl(f"{output_base}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), + .map(lambda url: download_tar(url, output_path)) + .write_jsonl(f"{output_path}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), ) # load all of the output filenames to process @@ -107,36 +106,27 @@ def download_wikipedia(input_urls: list[str], revision: str, output_path: str) - extracted = ctx.execute( Dataset.from_list(downloads) - .flat_map(lambda file: process_file(file, output_base)) - .write_jsonl(f"{output_base}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), + .flat_map(lambda file: process_file(file, output_path)) + .write_jsonl(f"{output_path}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), ) logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) -ENWIKI_20241201_URL = ( - "https://dumps.wikimedia.org/other/enterprise_html/runs/20241201/" "enwiki-NS0-20241201-ENTERPRISE-HTML.json.tar.gz" -) - - def download_wikipedia_step( *, input_urls: list[str] | None = None, - revision: str = "20241201", ) -> StepSpec: - """Download Wikipedia HTML dumps. - - Defaults to the English Wikipedia 20241201 dump which is already - downloaded at ``raw/wikipedia-a7dad0``. - """ - urls = input_urls or [ENWIKI_20241201_URL] + """Download Wikipedia HTML dumps""" def _run(output_path: str) -> None: - download_wikipedia(urls, revision, output_path) + assert input_urls is not None, "input_urls must be provided to download Wikipedia data" + download_wikipedia(input_urls, output_path) return StepSpec( name="raw/wikipedia", fn=_run, - hash_attrs={"input_urls": urls, "revision": revision}, - override_output_path="raw/wikipedia-a7dad0", + hash_attrs={"input_urls": input_urls}, + # NOTE: if no inputs are provided, use the previously downloaded and no longer exposed 2024-12-01 data + override_output_path="raw/wikipedia-9273e1" if input_urls is None else None, ) From 3f85e10c9224fa49443782d83c60ee5c819fcd6d Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 18:47:06 -0700 Subject: [PATCH 50/56] Wire download_wikipedia_step into exp934 as StepSpec dependency The Wikipedia transform step is now a StepSpec with the download step as a dep, replacing the hardcoded mirrored() path. Converted to .as_executor_step().cd("20241201") for backward compat with downstream consumers. 
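Reduced to its essentials, the wiring pattern this patch applies looks roughly like the sketch below; the step name, directory, and run_extraction helper are placeholders, not the identifiers used in exp934:

    from marin.datakit.download.wikipedia import download_wikipedia_step
    from marin.execution.step_spec import StepSpec

    def run_extraction(input_path: str, output_path: str) -> None:
        """Placeholder for the real transform function."""
        ...

    download = download_wikipedia_step()

    transform = StepSpec(
        name="documents/wikipedia-example",  # placeholder name
        fn=lambda output_path: run_extraction(
            input_path=f"{download.output_path}/20241201",
            output_path=output_path,
        ),
        deps=[download],
        hash_attrs={"revision": "20241201"},
    )

    # Bridge back into the executor world for existing consumers.
    step = transform.as_executor_step().cd("20241201")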
Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/exp934_hq_vs_pt.py | 39 +++++++++++-------- .../src/marin/datakit/download/wikipedia.py | 2 +- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/experiments/exp934_hq_vs_pt.py b/experiments/exp934_hq_vs_pt.py index 56a385cb32..fa375a30e8 100644 --- a/experiments/exp934_hq_vs_pt.py +++ b/experiments/exp934_hq_vs_pt.py @@ -8,7 +8,9 @@ datasets used by various training experiments. """ +from marin.datakit.download.wikipedia import download_wikipedia_step from marin.execution.executor import ExecutorStep, mirrored, this_output_path, versioned +from marin.execution.step_spec import StepSpec from marin.schemas.web.convert import HtmlToMarkdownConfig, ResiliparseConfig from marin.schemas.web.selectors import ARXIV_BLACKLISTED_SELECTORS, WIKI_BLACKLISTED_SELECTORS from marin.transform.ar5iv.transform_ar5iv import Ar5ivExtractionConfig, process_ar5iv_dump @@ -42,30 +44,33 @@ ), ).with_output_path("documents/stackexchange-resiliparse-custom-fork-ab41ad") -# Wikipedia resiliparse custom fork step (data already exists at hardcoded path) -wikipedia_resiliparse_custom_fork = ( - ExecutorStep( - name="documents/wikipedia-resiliparse-custom-fork", - fn=process_wiki_dump, - config=WikiExtractionConfig( - input_path=mirrored("raw/wikipedia-a7dad0/20241201", budget_gb=1), - revision=versioned("20241201"), - output_path=this_output_path(), +_wikipedia_download = download_wikipedia_step() + +# Wikipedia resiliparse custom fork step +_wikipedia_transform = StepSpec( + name="documents/wikipedia-resiliparse-custom-fork", + fn=lambda output_path: process_wiki_dump( + WikiExtractionConfig( + input_path=f"{_wikipedia_download.output_path}/20241201", + revision="20241201", + output_path=output_path, extract_method="resiliparse", extract_config=ResiliparseConfig( links=False, skip_elements=WIKI_BLACKLISTED_SELECTORS, markdownify_config=HtmlToMarkdownConfig(include_images=False, include_links=False), ), - remove_reference_section=versioned(True), - digit_threshold=versioned(50), - word_threshold=versioned(70), - special_char_threshold=versioned(50), - ), - ) - .with_output_path("documents/wikipedia-resiliparse-custom-fork-2569de") - .cd("20241201") + remove_reference_section=True, + digit_threshold=50, + word_threshold=70, + special_char_threshold=50, + ) + ), + deps=[_wikipedia_download], + hash_attrs={"revision": "20241201", "extract_method": "resiliparse"}, + override_output_path="documents/wikipedia-resiliparse-custom-fork-2569de", ) +wikipedia_resiliparse_custom_fork = _wikipedia_transform.as_executor_step().cd("20241201") # ar5iv resiliparse custom fork step (data already exists at hardcoded path) ar5iv_no_problem_resiliparse_custom_fork = ExecutorStep( diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index c2ccbdbe02..683c5735aa 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -128,5 +128,5 @@ def _run(output_path: str) -> None: fn=_run, hash_attrs={"input_urls": input_urls}, # NOTE: if no inputs are provided, use the previously downloaded and no longer exposed 2024-12-01 data - override_output_path="raw/wikipedia-9273e1" if input_urls is None else None, + override_output_path="raw/wikipedia-a7dad0" if input_urls is None else None, ) From 5032fc6678020ded199af375b8ed051ee8e7d699 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 18:48:59 -0700 Subject: [PATCH 51/56] Fix Wikipedia 
download override path to wikipedia-9273e1 Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/wikipedia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index 683c5735aa..c2ccbdbe02 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -128,5 +128,5 @@ def _run(output_path: str) -> None: fn=_run, hash_attrs={"input_urls": input_urls}, # NOTE: if no inputs are provided, use the previously downloaded and no longer exposed 2024-12-01 data - override_output_path="raw/wikipedia-a7dad0" if input_urls is None else None, + override_output_path="raw/wikipedia-9273e1" if input_urls is None else None, ) From d99a39290472cbf506e3e1877655c02508676626 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 18:58:54 -0700 Subject: [PATCH 52/56] Restore revision parameter in download_wikipedia The revision creates a subdirectory under output_path for the dump data, matching the original behavior. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/marin/datakit/download/wikipedia.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index c2ccbdbe02..a1a158ff76 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -90,15 +90,16 @@ def process_file(input_file: str, output_path: str) -> Iterable[str]: raise e -def download_wikipedia(input_urls: list[str], output_path: str) -> None: +def download_wikipedia(input_urls: list[str], revision: str, output_path: str) -> None: """Download and process Wikipedia data.""" logger.info("Starting transfer of Wikipedia dump...") + output_base = os.path.join(output_path, revision) ctx = ZephyrContext(name="download-wikipedia") download_metrics = ctx.execute( Dataset.from_list(input_urls) - .map(lambda url: download_tar(url, output_path)) - .write_jsonl(f"{output_path}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), + .map(lambda url: download_tar(url, output_base)) + .write_jsonl(f"{output_base}/.metrics/download-{{shard:05d}}.jsonl", skip_existing=True), ) # load all of the output filenames to process @@ -106,8 +107,8 @@ def download_wikipedia(input_urls: list[str], output_path: str) -> None: extracted = ctx.execute( Dataset.from_list(downloads) - .flat_map(lambda file: process_file(file, output_path)) - .write_jsonl(f"{output_path}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), + .flat_map(lambda file: process_file(file, output_base)) + .write_jsonl(f"{output_base}/.metrics/process-{{shard:05d}}.jsonl", skip_existing=True), ) logger.info("Wikipedia dump transfer complete, wrote: %s", list(extracted)) @@ -116,17 +117,18 @@ def download_wikipedia(input_urls: list[str], output_path: str) -> None: def download_wikipedia_step( *, input_urls: list[str] | None = None, + revision: str = "20241201", ) -> StepSpec: - """Download Wikipedia HTML dumps""" + """Download Wikipedia HTML dumps.""" def _run(output_path: str) -> None: assert input_urls is not None, "input_urls must be provided to download Wikipedia data" - download_wikipedia(input_urls, output_path) + download_wikipedia(input_urls, revision, output_path) return StepSpec( name="raw/wikipedia", fn=_run, - hash_attrs={"input_urls": input_urls}, - # NOTE: if no 
inputs are provided, use the previously downloaded and no longer exposed 2024-12-01 data + hash_attrs={"input_urls": input_urls, "revision": revision}, + # NOTE: if no inputs are provided, use the previously downloaded 2024-12-01 data override_output_path="raw/wikipedia-9273e1" if input_urls is None else None, ) From 845739fac564151a92f7395b8e6d0f60dbfcc69a Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 19:00:49 -0700 Subject: [PATCH 53/56] Make revision required in download_wikipedia_step Both input_urls and revision must be explicitly provided for new downloads. Existing data is still accessed via override_output_path when neither is set. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/marin/src/marin/datakit/download/wikipedia.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/marin/src/marin/datakit/download/wikipedia.py b/lib/marin/src/marin/datakit/download/wikipedia.py index a1a158ff76..a989b1ea97 100644 --- a/lib/marin/src/marin/datakit/download/wikipedia.py +++ b/lib/marin/src/marin/datakit/download/wikipedia.py @@ -117,12 +117,13 @@ def download_wikipedia(input_urls: list[str], revision: str, output_path: str) - def download_wikipedia_step( *, input_urls: list[str] | None = None, - revision: str = "20241201", + revision: str | None = None, ) -> StepSpec: """Download Wikipedia HTML dumps.""" def _run(output_path: str) -> None: assert input_urls is not None, "input_urls must be provided to download Wikipedia data" + assert revision is not None, "revision must be provided to download Wikipedia data" download_wikipedia(input_urls, revision, output_path) return StepSpec( From d5499904746bbb6c2926702bc95f84404f1725ab Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 19:03:47 -0700 Subject: [PATCH 54/56] Wire ar5iv_step into exp934 as StepSpec dependency Same pattern as Wikipedia: download step with override pointing at existing data, transform step as StepSpec with download as dep. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/exp934_hq_vs_pt.py | 41 +++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/experiments/exp934_hq_vs_pt.py b/experiments/exp934_hq_vs_pt.py index fa375a30e8..d062454ae3 100644 --- a/experiments/exp934_hq_vs_pt.py +++ b/experiments/exp934_hq_vs_pt.py @@ -8,6 +8,7 @@ datasets used by various training experiments. 
""" +from marin.datakit.download.ar5iv import ar5iv_step from marin.datakit.download.wikipedia import download_wikipedia_step from marin.execution.executor import ExecutorStep, mirrored, this_output_path, versioned from marin.execution.step_spec import StepSpec @@ -72,23 +73,33 @@ ) wikipedia_resiliparse_custom_fork = _wikipedia_transform.as_executor_step().cd("20241201") -# ar5iv resiliparse custom fork step (data already exists at hardcoded path) -ar5iv_no_problem_resiliparse_custom_fork = ExecutorStep( +_ar5iv_download = ar5iv_step( + input_path="gs://marin-us-central2/raw/ar5iv/ar5iv-04-2024-no-problem.zip", + override_output_path="raw/ar5iv/ar5iv-04-2024-no-problem-49c4e3", +) + +# ar5iv resiliparse custom fork step +_ar5iv_transform = StepSpec( name="documents/ar5iv/ar5iv-04-2024-no-problem", - fn=process_ar5iv_dump, - config=Ar5ivExtractionConfig( - input_path=mirrored("raw/ar5iv/ar5iv-04-2024-no-problem-49c4e3/202404", budget_gb=1), - revision="042024", - output_path=this_output_path("resiliparse-custom-fork"), - extract_method=versioned("resiliparse"), - extract_config=ResiliparseConfig( - links=versioned(False), - prepend_title=True, - skip_elements=ARXIV_BLACKLISTED_SELECTORS, - ), - remove_reference_section=versioned(True), + fn=lambda output_path: process_ar5iv_dump( + Ar5ivExtractionConfig( + input_path=f"{_ar5iv_download.output_path}/202404", + revision="042024", + output_path=output_path, + extract_method="resiliparse", + extract_config=ResiliparseConfig( + links=False, + prepend_title=True, + skip_elements=ARXIV_BLACKLISTED_SELECTORS, + ), + remove_reference_section=True, + ) ), -).with_output_path("documents/ar5iv/ar5iv-04-2024-no-problem-3971f") + deps=[_ar5iv_download], + hash_attrs={"revision": "042024", "extract_method": "resiliparse"}, + override_output_path="documents/ar5iv/ar5iv-04-2024-no-problem-3971f", +) +ar5iv_no_problem_resiliparse_custom_fork = _ar5iv_transform.as_executor_step() # MMLU Science QA tokenization medu_mmlu_science_qa_tokenized = default_tokenize( From eae62faa186d4e2612833c7123cce34c0f00f2ad Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Wed, 25 Mar 2026 19:50:29 -0700 Subject: [PATCH 55/56] Remove unused download entries from simple.py fineweb (never downloaded), the_stack_dedup, and the_pile_openwebtext2 have no consumers. 
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 experiments/pretraining_datasets/simple.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py
index 5fa9a5fa65..9b360e8f61 100644
--- a/experiments/pretraining_datasets/simple.py
+++ b/experiments/pretraining_datasets/simple.py
@@ -69,7 +69,6 @@ def _build_downloads() -> dict[str, ExecutorStep | InputName]:
     fineweb_edu_base = _dl("raw/fineweb-edu", "HuggingFaceFW/fineweb-edu", "87f0914", "raw/fineweb-edu-87f0914")

     return {
-        "fineweb": _dl("raw/fineweb", "HuggingFaceFW/fineweb", "cd85054", "raw/fineweb"),
         "fineweb_edu": fineweb_edu_base.cd("data"),
         "fineweb_edu_sample_10bt": fineweb_edu_base.cd("sample/10BT"),
         "fineweb_edu_sample_100bt": fineweb_edu_base.cd("sample/100BT"),
@@ -93,19 +92,11 @@
         "dclm_baseline": (
             _dl("raw/dclm-baseline-1.0", "mlfoundations/dclm-baseline-1.0", "a3b142c", "raw/dclm").cd("a3b142c")
         ),
-        "the_stack_dedup": (
-            _dl("raw/the-stack-dedup", "bigcode/the-stack-dedup", "17cad72", "raw/the-stack-dedup-4ba450").cd("17cad72")
-        ),
         "proofpile_2": (
             _dl("raw/proof-pile-2", "EleutherAI/proof-pile-2", "901a927", "raw/proof-pile-2-f1b1d8").cd(
                 "901a927/huggingface.co/datasets/EleutherAI/proof-pile-2/resolve/901a927"
             )
         ),
-        "the_pile_openwebtext2": (
-            _dl("raw/the_pile_openwebtext2", "vietgpt/the_pile_openwebtext2", "1de27c6", "raw/the_pile_openwebtext2").cd(
-                "1de27c6/huggingface.co/datasets/vietgpt/the_pile_openwebtext2/resolve/1de27c6"
-            )
-        ),
         "starcoderdata": _dl("raw/starcoderdata", "bigcode/starcoderdata", "9fc30b5", "raw/starcoderdata-720c8c"),
     }

From 6e750844034583d5c9de40cb7cdffed3784b152c Mon Sep 17 00:00:00 2001
From: Rafal Wojdyla
Date: Thu, 26 Mar 2026 16:22:15 -0700
Subject: [PATCH 56/56] Address PR review comments

- Add append_sha_to_path to download_hf_step, fix dolma3 download which writes files under {output_path}/{revision} (P1 fix)
- Flatten ar5iv from package to single ar5iv.py, delete unused ar5iv-v04-2024.json data file

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 experiments/pretraining_datasets/simple.py    | 16 +++++++++---
 .../download/{ar5iv/download.py => ar5iv.py}  |  0
 .../marin/datakit/download/ar5iv/__init__.py  |  7 -----
 .../download/ar5iv/ar5iv-v04-2024.json        | 26 ------
 .../src/marin/datakit/download/huggingface.py |  4 +++
 5 files changed, 17 insertions(+), 36 deletions(-)

diff --git a/experiments/pretraining_datasets/simple.py b/experiments/pretraining_datasets/simple.py
index 9b360e8f61..1e3e4ff6e7 100644
--- a/experiments/pretraining_datasets/simple.py
+++ b/experiments/pretraining_datasets/simple.py
@@ -53,10 +53,16 @@ def _tokenize_simple(
     return step


-def _dl(name: str, hf_dataset_id: str, revision: str, output_path: str) -> ExecutorStep:
+def _dl(
+    name: str, hf_dataset_id: str, revision: str, output_path: str, *, append_sha_to_path: bool = False
+) -> ExecutorStep:
     """Create a download ExecutorStep from a StepSpec."""
     return download_hf_step(
-        name, hf_dataset_id=hf_dataset_id, revision=revision, override_output_path=output_path
+        name,
+        hf_dataset_id=hf_dataset_id,
+        revision=revision,
+        append_sha_to_path=append_sha_to_path,
+        override_output_path=output_path,
     ).as_executor_step()


@@ -83,7 +89,11 @@ def _build_downloads() -> dict[str, ExecutorStep | InputName]:
         ),
         "dolma3_mix_150b_1025": (
             _dl(
-                "raw/dolma3_mix-150B-1025", "allenai/dolma3_mix-150B-1025", "15d04ee", "raw/dolma3_mix-150B-1025-15d04ee"
+                "raw/dolma3_mix-150B-1025",
+                "allenai/dolma3_mix-150B-1025",
+                "15d04ee",
+                "raw/dolma3_mix-150B-1025-15d04ee",
+                append_sha_to_path=True,
             ).cd("15d04ee")
         ),
         "dclm_baseline_wrong": _dl(
diff --git a/lib/marin/src/marin/datakit/download/ar5iv/download.py b/lib/marin/src/marin/datakit/download/ar5iv.py
similarity index 100%
rename from lib/marin/src/marin/datakit/download/ar5iv/download.py
rename to lib/marin/src/marin/datakit/download/ar5iv.py
diff --git a/lib/marin/src/marin/datakit/download/ar5iv/__init__.py b/lib/marin/src/marin/datakit/download/ar5iv/__init__.py
deleted file mode 100644
index 5d820ef55f..0000000000
--- a/lib/marin/src/marin/datakit/download/ar5iv/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright The Marin Authors
-# SPDX-License-Identifier: Apache-2.0
-
-from marin.datakit.download.ar5iv.download import Ar5ivDownloadConfig as Ar5ivDownloadConfig
-from marin.datakit.download.ar5iv.download import ar5iv_step as ar5iv_step
-from marin.datakit.download.ar5iv.download import download as download
-from marin.datakit.download.ar5iv.download import process_shard as process_shard
diff --git a/lib/marin/src/marin/datakit/download/ar5iv/ar5iv-v04-2024.json b/lib/marin/src/marin/datakit/download/ar5iv/ar5iv-v04-2024.json
deleted file mode 100644
index 7c178afb61..0000000000
--- a/lib/marin/src/marin/datakit/download/ar5iv/ar5iv-v04-2024.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
-  "dataset": "ar5iv",
-  "version": "v04.2024",
-  "links": [
-    {
-      "name": "C-UDA-1.0.md",
-      "url": "",
-      "checksum": {"type": "md5", "encoding": "hex", "hash": "0476ea786ce0e3291f6eaaabc43e250e"}
-    },
-    {
-      "name": "ar5iv-04-2024-errors.zip",
-      "url": "",
-      "checksum": {"type": "md5", "encoding": "hex", "hash": "9178d9635085a657956402077b4f8301"}
-    },
-    {
-      "name": "ar5iv-04-2024-no-problem.zip",
-      "url": "",
-      "checksum": {"type": "md5", "encoding": "hex", "hash": "6ffa80fa273f29716527db36e1841abf"}
-    },
-    {
-      "name": "ar5iv-04-2024-warnings.zip",
-      "url": "",
-      "checksum": {"type": "md5", "encoding": "hex", "hash": "51582b218f55286e5fe08431eb5e299d"}
-    }
-  ]
-}
diff --git a/lib/marin/src/marin/datakit/download/huggingface.py b/lib/marin/src/marin/datakit/download/huggingface.py
index c414df96a9..f6ee228cd5 100644
--- a/lib/marin/src/marin/datakit/download/huggingface.py
+++ b/lib/marin/src/marin/datakit/download/huggingface.py
@@ -349,6 +349,7 @@ def download_hf_step(
     hf_dataset_id: str,
     revision: str,
     hf_urls_glob: list[str] | None = None,
+    append_sha_to_path: bool = False,
     zephyr_max_parallelism: int = 8,
     deps: list[StepSpec] | None = None,
     override_output_path: str | None = None,
@@ -362,6 +363,7 @@
         hf_dataset_id: HuggingFace dataset identifier (e.g. "HuggingFaceFW/fineweb").
         revision: Commit hash from the HF dataset repo.
         hf_urls_glob: Glob patterns to select specific files. Empty means all files.
+        append_sha_to_path: If True, write outputs under ``output_path/{revision}``.
         zephyr_max_parallelism: Maximum download parallelism.
         deps: Optional upstream dependencies.
         override_output_path: Override the computed output path entirely.
@@ -378,6 +380,7 @@ def _run(output_path: str) -> None:
                 revision=revision,
                 hf_urls_glob=resolved_glob,
                 gcs_output_path=output_path,
+                append_sha_to_path=append_sha_to_path,
                 zephyr_max_parallelism=zephyr_max_parallelism,
             )
         )
@@ -390,6 +393,7 @@ def _run(output_path: str) -> None:
             "hf_dataset_id": hf_dataset_id,
             "revision": revision,
             "hf_urls_glob": resolved_glob,
+            "append_sha_to_path": append_sha_to_path,
         },
         override_output_path=override_output_path,
     )
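
For reference, a minimal caller-side sketch of the new flag (the dataset id and revision below are hypothetical placeholders; `_dl` and `download_hf_step` are the helpers shown above): with `append_sha_to_path=True` the download writes files under `{output_path}/{revision}`, so consumers point at that subdirectory via `.cd(revision)`, mirroring the dolma3_mix_150b_1025 wiring.

```python
# Sketch only: "some-org/some-dataset" and "abc1234" are hypothetical placeholders,
# not datasets referenced by this patch series.
revision = "abc1234"
example_download = _dl(
    "raw/some-dataset",
    "some-org/some-dataset",
    revision,
    f"raw/some-dataset-{revision}",
    append_sha_to_path=True,  # files land under {output_path}/{revision}
).cd(revision)  # point downstream steps at the revision subdirectory
```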