
Commit eacfcf2

Merge remote-tracking branch 'origin/main' into codex/perplexity-gap-lazy-boundaries
2 parents 4afdd4f + 8775c5b

50 files changed

Lines changed: 1756 additions & 1148 deletions

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
name: Marin - Datakit Nemotron Ferry

on:
  schedule:
    - cron: '0 1 * * 1' # Weekly, Monday 01:00 UTC
  workflow_dispatch:

permissions:
  contents: read

jobs:
  datakit-nemotron-ferry:
    runs-on: ubuntu-latest
    timeout-minutes: 1440 # 24h — nemotron medium is ~3.4 TiB
    concurrency:
      group: datakit-nemotron-ferry
      cancel-in-progress: true
    env:
      SMOKE_RUN_ID: datakit-nemotron-ferry-${{ github.run_id }}-${{ github.run_attempt }}
      FERRY_STATUS_PATH: gs://marin-tmp-eu-west4/ttl=1d/ci/datakit-nemotron-ferry-${{ github.run_id }}-${{ github.run_attempt }}/ferry_run_status.json
      WANDB_ENTITY: marin-community
      WANDB_PROJECT: marin
      IRIS_CONFIG: lib/iris/examples/marin.yaml
      IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true

      - name: Install dependencies
        run: uv sync --all-packages --extra=cpu --no-default-groups

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      - name: Install SSH key
        env:
          SSH_KEY: ${{ secrets.IRIS_CI_GCP_SSH_KEY }}
          SSH_KEY_PUB: ${{ secrets.IRIS_CI_GCP_SSH_KEY_PUB }}
        run: |
          mkdir -p ~/.ssh
          printf '%s\n' "$SSH_KEY" > ~/.ssh/google_compute_engine
          printf '%s\n' "$SSH_KEY_PUB" > ~/.ssh/google_compute_engine.pub
          chmod 600 ~/.ssh/google_compute_engine
          chmod 644 ~/.ssh/google_compute_engine.pub

      - name: Submit datakit nemotron ferry
        id: submit
        shell: bash -l {0}
        run: |
          JOB_ID=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
            job run --no-wait \
            --region=europe-west4 \
            --memory=3G --disk=5G --cpu=1 --extra=cpu \
            --priority production \
            -e SMOKE_RUN_ID "$SMOKE_RUN_ID" \
            -e FERRY_STATUS_PATH "$FERRY_STATUS_PATH" \
            -e WANDB_ENTITY "$WANDB_ENTITY" \
            -e WANDB_PROJECT "$WANDB_PROJECT" \
            -e WANDB_API_KEY "$WANDB_API_KEY" \
            -e HF_TOKEN "$HF_TOKEN" \
            -- python -m experiments.ferries.datakit_nemotron_ferry)
          echo "job_id=$JOB_ID" >> "$GITHUB_OUTPUT"
          echo "Submitted job: $JOB_ID"
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}

      - name: Wait for datakit nemotron ferry
        shell: bash -l {0}
        run: |
          JOB_ID="${{ steps.submit.outputs.job_id }}"
          echo "Polling job status: $JOB_ID"
          while true; do
            STATE=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
              job list --json --prefix "$JOB_ID" \
              | jq -r --arg id "$JOB_ID" '[.[] | select(.job_id == $id)][0].state // empty')
            case "$STATE" in
              JOB_STATE_SUCCEEDED)
                echo "Job succeeded"
                exit 0
                ;;
              JOB_STATE_PENDING|JOB_STATE_BUILDING|JOB_STATE_RUNNING)
                echo "$(date -u +%H:%M:%S) Job state: $STATE"
                sleep 30
                ;;
              "")
                echo "Job not found: $JOB_ID"
                exit 1
                ;;
              *)
                echo "Job finished with state: $STATE"
                .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
                  job list --json --prefix "$JOB_ID" \
                  | jq --arg id "$JOB_ID" '.[] | {job_id, state, error}' || true
                exit 1
                ;;
            esac
          done

      - name: Capture failure diagnostics
        if: failure() || cancelled()
        run: |
          echo "=== Controller logs ==="
          .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
            process logs --max-lines=200 || true
          echo "=== Job list ==="
          .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
            job list --json 2>/dev/null | jq '.[0:5]' || true

  # Separate job so Slack always fires, even if the main job is force-killed
  # after its grace window. See the datakit-smoke workflow for rationale.
  notify-slack:
    needs: datakit-nemotron-ferry
    if: always() && (needs.datakit-nemotron-ferry.result == 'failure' || needs.datakit-nemotron-ferry.result == 'cancelled') && github.event_name == 'schedule'
    runs-on: ubuntu-latest
    steps:
      - name: Notify Slack
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
          TEXT: ":red_circle: *Datakit Nemotron Ferry failed*\nRun: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        run: |
          PAYLOAD=$(python3 -c "import sys,json; print(json.dumps({'text': sys.stdin.read()}))" <<< "$TEXT")
          curl -sf -X POST -H 'Content-Type: application/json' -d "$PAYLOAD" "$SLACK_WEBHOOK_URL"
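For reference, a minimal Python sketch of what the poll step's jq filter computes. The JSON payload below is hypothetical; it only mirrors the field names (`job_id`, `state`, `error`) that the filter itself references:

# Equivalent of: [.[] | select(.job_id == $id)][0].state // empty
jobs = [  # hypothetical `iris job list --json` output
    {"job_id": "datakit-nemotron-ferry-123-1", "state": "JOB_STATE_RUNNING", "error": None},
    {"job_id": "some-other-job", "state": "JOB_STATE_SUCCEEDED", "error": None},
]
job_id = "datakit-nemotron-ferry-123-1"
matches = [job for job in jobs if job["job_id"] == job_id]
state = matches[0]["state"] if matches else ""  # "" stands in for jq's `empty`
print(state)  # JOB_STATE_RUNNING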

experiments/defaults.py

Lines changed: 5 additions & 0 deletions
@@ -207,6 +207,7 @@ def default_tokenize(
     sample_count: int | VersionedValue[int] | None = None,
     is_validation: bool = False,
     levanter_batch_size: int | None = None,
+    tags: Sequence[str] = (),
     resources: ResourceConfig | None = None,
     worker_resources: ResourceConfig | None = None,
 ) -> ExecutorStep:
@@ -227,6 +228,7 @@ def default_tokenize(
             for more details.
         sample_count: Optional limit on the number of samples to tokenize per shard. If ``None``, tokenize everything.
         is_validation: Whether the dataset is a validation set. Doesn't do anything for HF datasets.
+        tags: Tags to attach to the Levanter dataset source for tagged evaluation.
     Returns:
         An ExecutorStep that represents the tokenized dataset.
     """
@@ -246,6 +248,7 @@ def default_tokenize(
             format=format,
             sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
             levanter_batch_size=levanter_batch_size,
+            tags=[*tags],
             **extra_kwargs,
         )
     elif (
@@ -261,6 +264,7 @@ def default_tokenize(
             format=format,
             sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
             levanter_batch_size=levanter_batch_size,
+            tags=[*tags],
             **extra_kwargs,
         )
     else:
@@ -272,6 +276,7 @@ def default_tokenize(
             format=format,
             sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
             levanter_batch_size=levanter_batch_size,
+            tags=[*tags],
             **extra_kwargs,
         )
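
A minimal sketch of how a caller could use the new `tags` parameter; the step name, dataset pattern, and tag strings here are hypothetical, while `default_tokenize` and `llama3_tokenizer` come from this repo:

from experiments.defaults import default_tokenize
from experiments.llama import llama3_tokenizer

# Hypothetical validation set tokenized with eval tags attached; Levanter can
# then report metrics aggregated per tag in addition to per dataset.
step = default_tokenize(
    name="my_corpus/validation",  # hypothetical step name
    dataset="hf://datasets/my-org/my-corpus/test/*.parquet",  # hypothetical source
    tokenizer=llama3_tokenizer,
    is_validation=True,
    tags=["my_corpus", "my_corpus/language/eng"],  # forwarded as tags=[*tags]
)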

Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""
FineWeb2 multilingual data and held-out eval bundles.

The eval bundle tokenizes FineWeb2's per-language ``test`` split directly from Hugging Face parquet files. This avoids
downloading the full train split while still making held-out documents available as Levanter validation caches.
"""

import os.path
from collections.abc import Sequence
from typing import Literal

from experiments.defaults import default_tokenize
from experiments.llama import llama3_tokenizer
from marin.execution.executor import executor_main
from marin.processing.tokenize.data_configs import TokenizerStep

FINEWEB2_DATASET_ID = "HuggingFaceFW/fineweb-2"
FINEWEB2_PARQUET_REVISION = "345aeeb34ec379862323beb9b5530d9e7f94522d"
FineWeb2Split = Literal["train", "test"]
LevanterCacheSplit = Literal["train", "validation"]
FINEWEB2_EVAL_SPLIT: FineWeb2Split = "test"

# Top 50 configs by total row count from the Hugging Face Dataset Viewer /size endpoint for
# HuggingFaceFW/fineweb-2 source revision af9c13333eb981300149d5ca60a8e9d659b276b9.
FINEWEB2_TOP_50_BY_ROWS = (
    "rus_Cyrl",
    "cmn_Hani",
    "deu_Latn",
    "jpn_Jpan",
    "fra_Latn",
    "ita_Latn",
    "por_Latn",
    "pol_Latn",
    "nld_Latn",
    "ind_Latn",
    "ces_Latn",
    "arb_Arab",
    "vie_Latn",
    "kor_Hang",
    "swe_Latn",
    "fas_Arab",
    "ron_Latn",
    "ukr_Cyrl",
    "hun_Latn",
    "ell_Grek",
    "dan_Latn",
    "nob_Latn",
    "fin_Latn",
    "tha_Thai",
    "slk_Latn",
    "bul_Cyrl",
    "hin_Deva",
    "bos_Latn",
    "cat_Latn",
    "ben_Beng",
    "heb_Hebr",
    "lit_Latn",
    "slv_Latn",
    "ekk_Latn",
    "zsm_Latn",
    "als_Latn",
    "lvs_Latn",
    "azj_Latn",
    "hrv_Latn",
    "tam_Taml",
    "npi_Deva",
    "urd_Arab",
    "mkd_Cyrl",
    "srp_Cyrl",
    "mar_Deva",
    "kat_Geor",
    "kaz_Cyrl",
    "mal_Mlym",
    "isl_Latn",
    "glg_Latn",
)

# Native-script South Asian/Indic configs available in FineWeb2, including every config written in an Indic script.
# Romanized variants are deliberately omitted so the supplement tracks the primary written form of each language.
FINEWEB2_INDIC_LANGUAGE_CONFIGS = (
    "anp_Deva",
    "asm_Beng",
    "awa_Deva",
    "ben_Beng",
    "bho_Deva",
    "bpy_Beng",
    "brx_Deva",
    "div_Thaa",
    "doi_Deva",
    "gom_Deva",
    "grt_Beng",
    "guj_Gujr",
    "hin_Deva",
    "hne_Deva",
    "kan_Knda",
    "kas_Arab",
    "kas_Deva",
    "kle_Deva",
    "lif_Deva",
    "mag_Deva",
    "mai_Deva",
    "mal_Mlym",
    "mar_Deva",
    "mni_Beng",
    "mni_Mtei",
    "mup_Deva",
    "new_Deva",
    "npi_Deva",
    "ory_Orya",
    "pan_Guru",
    "rav_Deva",
    "san_Deva",
    "sat_Olck",
    "sck_Deva",
    "sin_Sinh",
    "skr_Arab",
    "snd_Arab",
    "snd_Deva",
    "suz_Deva",
    "taj_Deva",
    "tam_Taml",
    "tcy_Knda",
    "tel_Telu",
    "thl_Deva",
    "urd_Arab",
    "xsr_Deva",
)

FINEWEB2_MULTILINGUAL_EVAL_CONFIGS = tuple(dict.fromkeys((*FINEWEB2_TOP_50_BY_ROWS, *FINEWEB2_INDIC_LANGUAGE_CONFIGS)))

_FINEWEB2_TOP_50_BY_ROWS_SET = frozenset(FINEWEB2_TOP_50_BY_ROWS)
_FINEWEB2_INDIC_LANGUAGE_CONFIGS_SET = frozenset(FINEWEB2_INDIC_LANGUAGE_CONFIGS)


def fineweb2_multilingual_parquet_pattern(config: str, split: FineWeb2Split) -> str:
    """Return the pinned Hugging Face parquet pattern for a FineWeb2 language config split."""
    return f"hf://datasets/{FINEWEB2_DATASET_ID}@{FINEWEB2_PARQUET_REVISION}/{config}/{split}/*.parquet"


def fineweb2_multilingual_tags(config: str) -> list[str]:
    """Return Levanter eval tags for aggregate multilingual, script, language, and subset metrics."""
    assert "_" in config, f"Expected FineWeb2 config in lang_Script form, got {config!r}"
    language, script = config.rsplit("_", maxsplit=1)
    tags = [
        "fineweb2_multilingual",
        f"fineweb2_multilingual/script/{script}",
        f"fineweb2_multilingual/language/{language}",
    ]
    if config in _FINEWEB2_TOP_50_BY_ROWS_SET:
        tags.append("fineweb2_multilingual/top_50_by_rows")
    if config in _FINEWEB2_INDIC_LANGUAGE_CONFIGS_SET:
        tags.append("fineweb2_multilingual/indic")
    return tags


def fineweb2_multilingual_tokenized(
    *,
    split: FineWeb2Split,
    configs: Sequence[str] = FINEWEB2_MULTILINGUAL_EVAL_CONFIGS,
    cache_split: LevanterCacheSplit = "train",
    name_prefix: str | None = None,
    tokenizer: str = llama3_tokenizer,
) -> dict[str, TokenizerStep]:
    """Return tokenization steps for selected FineWeb2 multilingual configs and split."""
    steps: dict[str, TokenizerStep] = {}
    if name_prefix is None:
        name_prefix = os.path.join("fineweb2_multilingual", split)
    for config in configs:
        name = os.path.join(name_prefix, config)
        steps[name] = default_tokenize(
            name=name,
            dataset=fineweb2_multilingual_parquet_pattern(config, split),
            tokenizer=tokenizer,
            is_validation=cache_split == "validation",
            tags=fineweb2_multilingual_tags(config),
        )
    return steps


def fineweb2_multilingual_eval_bundle(*, tokenizer: str = llama3_tokenizer) -> dict[str, TokenizerStep]:
    """Return the opt-in tokenization bundle for selected FineWeb2 multilingual held-out eval configs."""
    return fineweb2_multilingual_tokenized(
        split=FINEWEB2_EVAL_SPLIT,
        cache_split="validation",
        name_prefix="fineweb2_multilingual_eval",
        tokenizer=tokenizer,
    )


if __name__ == "__main__":
    executor_main(
        steps=list(fineweb2_multilingual_eval_bundle().values()),
        description="Tokenize FineWeb2 multilingual held-out eval sets",
    )
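
As a worked example of the tagging scheme above, a config that appears in both the top-50 and Indic tuples receives every tag level; both outputs follow directly from the function bodies:

fineweb2_multilingual_tags("hin_Deva")
# ['fineweb2_multilingual',
#  'fineweb2_multilingual/script/Deva',
#  'fineweb2_multilingual/language/hin',
#  'fineweb2_multilingual/top_50_by_rows',
#  'fineweb2_multilingual/indic']

fineweb2_multilingual_parquet_pattern("hin_Deva", "test")
# 'hf://datasets/HuggingFaceFW/fineweb-2@345aeeb34ec379862323beb9b5530d9e7f94522d/hin_Deva/test/*.parquet'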
