Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions experiments/pretraining_datasets/starcoder2_extras.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""StarCoder2 data extras: download and tokenize ir_cpp, ir_python, ir_rust, ir_low_resource, documentation."""
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 Nit: docstring missing "kaggle"

The docstring lists subsets but omits "kaggle", which is in SUBSETS.

Suggested change
"""StarCoder2 data extras: download and tokenize ir_cpp, ir_python, ir_rust, ir_low_resource, documentation."""
"""StarCoder2 data extras: download and tokenize ir_cpp, ir_python, ir_rust, ir_low_resource, documentation, kaggle."""


from experiments.defaults import default_tokenize
from experiments.marin_models import marin_tokenizer
from fray.v2 import ResourceConfig
from levanter.data.text.formats import TextLmDatasetFormat
from marin.datakit.download.starcoder2_extras import (
    SUBSETS,
    download_starcoder2_extras_step,
    reshard_starcoder2_extras_step,
)
from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned
from marin.processing.tokenize import TokenizeConfig, tokenize
from marin.processing.tokenize.data_configs import TokenizerStep

WORKER_RAM = {"ir_low_resource": "80g"}
DEFAULT_WORKER_RAM = "40g"


def tokenize_starcoder2_extras(*, tokenizer: str = marin_tokenizer) -> list[TokenizerStep]:
    """Download and tokenize all selected starcoder2data-extras subsets.

    Subsets in ``reshard_subsets`` are first resharded into evenly-sized
    parquet files; all others are tokenized straight from the raw download.

    Args:
        tokenizer: Tokenizer name to use; defaults to the Marin tokenizer.

    Returns:
        One tokenize ExecutorStep per subset in SUBSETS.
    """
    # ir_low_resource ships unevenly-sized shards, so reshard it before
    # tokenizing (and give its workers extra RAM via WORKER_RAM).
    reshard_subsets = {"ir_low_resource"}
    steps: list[TokenizerStep] = []
    for subset in SUBSETS:
        if subset in reshard_subsets:
            download = reshard_starcoder2_extras_step(subset)
        else:
            download = download_starcoder2_extras_step(subset)
        ram = WORKER_RAM.get(subset, DEFAULT_WORKER_RAM)
        # default_tokenize() does not accept a worker_resources kwarg, so build
        # the TokenizeConfig + ExecutorStep directly (the pattern used by the
        # dolmino and nemotron_v2 experiments) to set per-subset worker RAM.
        steps.append(
            ExecutorStep(
                name=f"tokenized/starcoder2_extras/{subset}",
                fn=tokenize,
                config=TokenizeConfig(
                    train_paths=[download.as_executor_step()],
                    validation_paths=versioned([]),
                    cache_path=this_output_path(),
                    tokenizer=versioned(tokenizer),
                    format=TextLmDatasetFormat(text_key="content"),
                    worker_resources=ResourceConfig(ram=ram, disk="10g"),
                ),
            )
        )
    return steps


if __name__ == "__main__":
executor_main(steps=tokenize_starcoder2_extras())
84 changes: 84 additions & 0 deletions lib/marin/src/marin/datakit/download/starcoder2_extras.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""Download subsets of the bigcode/starcoder2data-extras dataset from HuggingFace.

Subsets: ir_cpp, ir_python, ir_rust, ir_low_resource, documentation, kaggle.
"""

from marin.datakit.download.huggingface import download_hf_step
from marin.execution.step_spec import StepSpec

HF_DATASET_ID = "bigcode/starcoder2data-extras"
HF_REVISION = "1ba0d4f"

SUBSETS = ["ir_cpp", "ir_python", "ir_rust", "ir_low_resource", "documentation", "kaggle"]


def download_starcoder2_extras_step(subset: str) -> StepSpec:
    """Create the download step for one starcoder2data-extras subset.

    Fetches only the parquet files under the subset's directory, pinned to
    HF_REVISION, and writes them under a revision-stamped output path.
    """
    step_name = f"raw/starcoder2_extras/{subset}"
    output_path = f"raw/starcoder2_extras-{HF_REVISION}/{subset}"
    return download_hf_step(
        step_name,
        hf_dataset_id=HF_DATASET_ID,
        revision=HF_REVISION,
        hf_urls_glob=[f"{subset}/*.parquet"],
        override_output_path=output_path,
    )


def reshard_starcoder2_extras_step(subset: str, target_shard_mb: int = 200) -> StepSpec:
    """Reshard a downloaded subset into more evenly-sized parquet files.

    Args:
        subset: Name of the starcoder2data-extras subset to reshard.
        target_shard_mb: Target uncompressed data size per output shard, in MB.

    Returns:
        A StepSpec that depends on the raw download step and writes
        ``shard-NNNNN.parquet`` files to its output path.
    """
    raw = download_starcoder2_extras_step(subset)
    raw_output_path = raw.output_path

    def _run(output_path: str) -> None:
        import logging
        from urllib.parse import urlparse

        import pyarrow.parquet as pq
        from rigging.filesystem import url_to_fs

        logger = logging.getLogger(__name__)
        input_path = raw_output_path
        fs, _ = url_to_fs(input_path)
        # fs.glob() returns paths without the protocol prefix; derive the
        # prefix from input_path rather than hardcoding gs:// so this also
        # works on non-GCS filesystems.
        scheme = urlparse(input_path).scheme
        prefix = f"{scheme}://" if scheme else ""
        files = sorted(
            f"{prefix}{f}" for f in fs.glob(f"{input_path}/**/*.parquet") if not f.endswith("/.parquet")
        )

        # Read all files, split into evenly-sized output shards
        target_bytes = target_shard_mb * 1024 * 1024
        shard_idx = 0
        for file_path in files:
            meta = pq.read_metadata(file_path)
            # meta.serialized_size is only the size of the metadata footer
            # (a few KB); sum the row-group sizes to get the actual
            # uncompressed data size of the file.
            data_size = sum(meta.row_group(i).total_byte_size for i in range(meta.num_row_groups))
            if data_size <= target_bytes:
                # Small file — copy as-is
                out = f"{output_path}/shard-{shard_idx:05d}.parquet"
                table = pq.read_table(file_path)
                pq.write_table(table, out)
                logger.info(f"Copied {file_path} -> {out} ({table.num_rows} rows)")
                shard_idx += 1
            else:
                # Big file — split into roughly target_bytes-sized row slices
                table = pq.read_table(file_path)
                rows_per_shard = max(1, (table.num_rows * target_bytes) // data_size)
                offset = 0
                while offset < table.num_rows:
                    chunk = table.slice(offset, min(rows_per_shard, table.num_rows - offset))
                    out = f"{output_path}/shard-{shard_idx:05d}.parquet"
                    pq.write_table(chunk, out)
                    logger.info(
                        f"Split {file_path}[{offset}:{offset + chunk.num_rows}] -> {out} ({chunk.num_rows} rows)"
                    )
                    shard_idx += 1
                    offset += chunk.num_rows
                # Release the table before reading the next file to keep peak
                # memory bounded.
                del table

        logger.info(f"Resharded {len(files)} files into {shard_idx} shards")

    return StepSpec(
        name=f"resharded/starcoder2_extras/{subset}",
        fn=_run,
        deps=[raw],
        # Include target_shard_mb in the step hash so changing the shard size
        # invalidates previously cached outputs.
        hash_attrs={"target_shard_mb": target_shard_mb},
    )



def download_all_starcoder2_extras_steps() -> list[StepSpec]:
    """Build one download step for every subset of starcoder2data-extras."""
    return list(map(download_starcoder2_extras_step, SUBSETS))
Loading