-
Notifications
You must be signed in to change notification settings - Fork 103
Add starcoder2data-extras download and tokenization #4599
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| # Copyright The Marin Authors | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| """StarCoder2 data extras: download and tokenize ir_cpp, ir_python, ir_rust, ir_low_resource, documentation.""" | ||
|
|
||
| from experiments.defaults import default_tokenize | ||
| from experiments.marin_models import marin_tokenizer | ||
| from fray.v2 import ResourceConfig | ||
| from levanter.data.text.formats import TextLmDatasetFormat | ||
| from marin.datakit.download.starcoder2_extras import ( | ||
| SUBSETS, | ||
| download_starcoder2_extras_step, | ||
| reshard_starcoder2_extras_step, | ||
| ) | ||
| from marin.execution.executor import executor_main | ||
| from marin.processing.tokenize.data_configs import TokenizerStep | ||
|
|
||
| WORKER_RAM = {"ir_low_resource": "80g"} | ||
| DEFAULT_WORKER_RAM = "40g" | ||
|
|
||
|
|
||
| def tokenize_starcoder2_extras(*, tokenizer: str = marin_tokenizer) -> list[TokenizerStep]: | ||
| """Download and tokenize all selected starcoder2data-extras subsets.""" | ||
| steps = [] | ||
| RESHARD_SUBSETS = {"ir_low_resource"} | ||
| for subset in SUBSETS: | ||
| if subset in RESHARD_SUBSETS: | ||
| download = reshard_starcoder2_extras_step(subset) | ||
| else: | ||
| download = download_starcoder2_extras_step(subset) | ||
| ram = WORKER_RAM.get(subset, DEFAULT_WORKER_RAM) | ||
| steps.append( | ||
| default_tokenize( | ||
| name=f"starcoder2_extras/{subset}", | ||
| dataset=download.as_executor_step(), | ||
| tokenizer=tokenizer, | ||
| format=TextLmDatasetFormat(text_key="content"), | ||
| worker_resources=ResourceConfig(ram=ram, disk="10g"), | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎. |
||
| ) | ||
|
Comment on lines
+33
to
+39
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🤖 Bug:
def default_tokenize(
name: str,
dataset: InputName | ExecutorStep | str | HfDatasetSpec,
tokenizer: str,
format: LmDatasetFormatBase = TextLmDatasetFormat(),
*,
sample_count: int | VersionedValue[int] | None = None,
is_validation: bool = False,
) -> ExecutorStep:
There is no `worker_resources` parameter in this signature. To set custom worker resources, you'll need to use the lower-level tokenize API directly:
from marin.processing.tokenize import TokenizeConfig, tokenize
from marin.execution.executor import ExecutorStep, this_output_path, versioned
step = ExecutorStep(
name=f"tokenized/starcoder2_extras/{subset}",
fn=tokenize,
config=TokenizeConfig(
train_paths=[download.as_executor_step()],
validation_paths=versioned([]),
cache_path=this_output_path(),
tokenizer=versioned(tokenizer),
format=TextLmDatasetFormat(text_key="content"),
worker_resources=ResourceConfig(ram=ram, disk="10g"),
),
) |
||
| ) | ||
| return steps | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| executor_main(steps=tokenize_starcoder2_extras()) | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,84 @@ | ||||||||||||||||||||||||
| # Copyright The Marin Authors | ||||||||||||||||||||||||
| # SPDX-License-Identifier: Apache-2.0 | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| """Download subsets of the bigcode/starcoder2data-extras dataset from HuggingFace. | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| Subsets: ir_cpp, ir_python, ir_rust, ir_low_resource, documentation, kaggle. | ||||||||||||||||||||||||
| """ | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| from marin.datakit.download.huggingface import download_hf_step | ||||||||||||||||||||||||
| from marin.execution.step_spec import StepSpec | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| HF_DATASET_ID = "bigcode/starcoder2data-extras" | ||||||||||||||||||||||||
| HF_REVISION = "1ba0d4f" | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| SUBSETS = ["ir_cpp", "ir_python", "ir_rust", "ir_low_resource", "documentation", "kaggle"] | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| def download_starcoder2_extras_step(subset: str) -> StepSpec: | ||||||||||||||||||||||||
| """Download a single subset of the starcoder2data-extras dataset.""" | ||||||||||||||||||||||||
| return download_hf_step( | ||||||||||||||||||||||||
| f"raw/starcoder2_extras/{subset}", | ||||||||||||||||||||||||
| hf_dataset_id=HF_DATASET_ID, | ||||||||||||||||||||||||
| revision=HF_REVISION, | ||||||||||||||||||||||||
| hf_urls_glob=[f"{subset}/*.parquet"], | ||||||||||||||||||||||||
| override_output_path=f"raw/starcoder2_extras-{HF_REVISION}/{subset}", | ||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| def reshard_starcoder2_extras_step(subset: str, target_shard_mb: int = 200) -> StepSpec: | ||||||||||||||||||||||||
| """Reshard a downloaded subset into more evenly-sized parquet files.""" | ||||||||||||||||||||||||
| raw = download_starcoder2_extras_step(subset) | ||||||||||||||||||||||||
| raw_output_path = raw.output_path | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| def _run(output_path: str) -> None: | ||||||||||||||||||||||||
| import logging | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| import pyarrow.parquet as pq | ||||||||||||||||||||||||
| from rigging.filesystem import url_to_fs | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| logger = logging.getLogger(__name__) | ||||||||||||||||||||||||
| input_path = raw_output_path | ||||||||||||||||||||||||
| fs, _ = url_to_fs(input_path) | ||||||||||||||||||||||||
| files = sorted(f"gs://{f}" for f in fs.glob(f"{input_path}/**/*.parquet") if not f.endswith("/.parquet")) | ||||||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🤖 Nit: hardcoded `gs://` prefix — derive the scheme from the input path instead:
from urllib.parse import urlparse
scheme = urlparse(input_path).scheme
prefix = f"{scheme}://" if scheme else ""
files = sorted(f"{prefix}{f}" for f in fs.glob(...) if not f.endswith("/.parquet")) |
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| # Read all files, split into evenly-sized output shards | ||||||||||||||||||||||||
| target_bytes = target_shard_mb * 1024 * 1024 | ||||||||||||||||||||||||
| shard_idx = 0 | ||||||||||||||||||||||||
| for file_path in files: | ||||||||||||||||||||||||
| meta = pq.read_metadata(file_path) | ||||||||||||||||||||||||
| if meta.serialized_size <= target_bytes: | ||||||||||||||||||||||||
|
Comment on lines
+49
to
+50
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🤖 Bug: `meta.serialized_size` is the size of the serialized Parquet metadata (the footer), not the size of the file's data.
To get the actual data size, sum the row group sizes:
Suggested change
And update line 60 similarly: rows_per_shard = max(1, (table.num_rows * target_bytes) // data_size)
Comment on lines
+49
to
+50
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

This logic uses `meta.serialized_size` as a proxy for the file's data size, but that value reflects only the metadata footer, so large files may not be split as intended. Useful? React with 👍 / 👎. |
||||||||||||||||||||||||
| # Small file — copy as-is | ||||||||||||||||||||||||
| out = f"{output_path}/shard-{shard_idx:05d}.parquet" | ||||||||||||||||||||||||
| table = pq.read_table(file_path) | ||||||||||||||||||||||||
| pq.write_table(table, out) | ||||||||||||||||||||||||
| logger.info(f"Copied {file_path} -> {out} ({table.num_rows} rows)") | ||||||||||||||||||||||||
| shard_idx += 1 | ||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||
| # Big file — split by row groups or by row count | ||||||||||||||||||||||||
| table = pq.read_table(file_path) | ||||||||||||||||||||||||
| rows_per_shard = max(1, (table.num_rows * target_bytes) // meta.serialized_size) | ||||||||||||||||||||||||
| offset = 0 | ||||||||||||||||||||||||
| while offset < table.num_rows: | ||||||||||||||||||||||||
| chunk = table.slice(offset, min(rows_per_shard, table.num_rows - offset)) | ||||||||||||||||||||||||
| out = f"{output_path}/shard-{shard_idx:05d}.parquet" | ||||||||||||||||||||||||
| pq.write_table(chunk, out) | ||||||||||||||||||||||||
| logger.info( | ||||||||||||||||||||||||
| f"Split {file_path}[{offset}:{offset + chunk.num_rows}] -> {out} ({chunk.num_rows} rows)" | ||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||
| shard_idx += 1 | ||||||||||||||||||||||||
| offset += chunk.num_rows | ||||||||||||||||||||||||
| del table | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| logger.info(f"Resharded {len(files)} files into {shard_idx} shards") | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| return StepSpec( | ||||||||||||||||||||||||
| name=f"resharded/starcoder2_extras/{subset}", | ||||||||||||||||||||||||
| fn=_run, | ||||||||||||||||||||||||
| deps=[raw], | ||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||
|
Comment on lines
+75
to
+79
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🤖 Missing: the `StepSpec` below appears to be missing a field (the suggested change is truncated in this extract — presumably an `override_output_path`, matching the download step; verify against the original comment).
Suggested change
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| def download_all_starcoder2_extras_steps() -> list[StepSpec]: | ||||||||||||||||||||||||
| """Download all selected subsets of starcoder2data-extras.""" | ||||||||||||||||||||||||
| return [download_starcoder2_extras_step(subset) for subset in SUBSETS] | ||||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🤖 Nit: docstring missing "kaggle"
The docstring lists subsets but omits "kaggle", which is in
SUBSETS.