Skip to content

Commit dd6d49f

Browse files
authored
Merge branch 'main' into agent/20260324-fix-4082
2 parents e3a9fb2 + 69135bf commit dd6d49f

47 files changed

Lines changed: 1931 additions & 906 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/claude-review.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name: Claude PR Review
22

33
on:
4-
pull_request_target:
4+
pull_request:
55
types: [opened, ready_for_review, reopened]
66
issue_comment:
77
# NOTE: there's no PR comment specific event, so instead we need to check all comments and filter in the job.
@@ -16,7 +16,7 @@ jobs:
1616
review:
1717
if: >-
1818
(
19-
github.event_name == 'pull_request_target' &&
19+
github.event_name == 'pull_request' &&
2020
(
2121
github.event.pull_request.author_association == 'OWNER' ||
2222
github.event.pull_request.author_association == 'MEMBER' ||
@@ -50,7 +50,7 @@ jobs:
5050
uses: anthropics/claude-code-action@v1
5151
with:
5252
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN || secrets.CLAUDE_MAX_OAUTH_TOKEN }}
53-
track_progress: ${{ github.event_name == 'pull_request_target' }}
53+
track_progress: ${{ github.event_name == 'pull_request' }}
5454
prompt: |
5555
REPO: ${{ github.repository }}
5656
PR NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}

.github/workflows/iris-cloud-smoke-cw.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ jobs:
7777
echo "${{ secrets.CW_KUBECONFIG }}" > ~/.kube/coreweave-iris
7878
chmod 600 ~/.kube/coreweave-iris
7979
80+
- name: Set up Docker Buildx
81+
uses: docker/setup-buildx-action@v3
82+
8083
- name: Log in to GitHub Container Registry
8184
uses: docker/login-action@v3
8285
with:

.github/workflows/iris-cloud-smoke-gcp.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ jobs:
9191

9292
- name: Install Playwright system deps
9393
if: steps.playwright-cache.outputs.cache-hit == 'true'
94-
run: cd lib/iris && uv run playwright install-deps chromium
94+
run: npx --yes playwright@1.57.0 install-deps chromium
9595

9696
- name: Authenticate to Google Cloud
9797
uses: google-github-actions/auth@v2
@@ -109,6 +109,9 @@ jobs:
109109
echo "${{ secrets.MARIN_SSH_KEY }}" > ~/.ssh/marin_ray_cluster.pem
110110
chmod 600 ~/.ssh/marin_ray_cluster.pem
111111
112+
- name: Set up Docker Buildx
113+
uses: docker/setup-buildx-action@v3
114+
112115
- name: Log in to GitHub Container Registry
113116
uses: docker/login-action@v3
114117
with:

.github/workflows/iris-unit-tests.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ jobs:
100100

101101
- name: Install Playwright system deps
102102
if: steps.playwright-cache.outputs.cache-hit == 'true'
103-
run: cd lib/iris && uv run playwright install-deps chromium
103+
run: npx --yes playwright@1.57.0 install-deps chromium
104104

105105
- name: Run E2E smoke tests
106106
env:

.github/workflows/marin-canary-ferry-cw.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ jobs:
6464
echo "${{ secrets.CW_KUBECONFIG }}" > ~/.kube/coreweave-iris
6565
chmod 600 ~/.kube/coreweave-iris
6666
67+
- name: Set up Docker Buildx
68+
uses: docker/setup-buildx-action@v3
69+
6770
- name: Log in to GitHub Container Registry
6871
uses: docker/login-action@v3
6972
with:
@@ -94,6 +97,9 @@ jobs:
9497
-e WANDB_PROJECT "$WANDB_PROJECT" \
9598
-e WANDB_API_KEY "$WANDB_API_KEY" \
9699
-e HF_TOKEN "$HF_TOKEN" \
100+
-e AWS_ACCESS_KEY_ID "$R2_ACCESS_KEY_ID" \
101+
-e AWS_SECRET_ACCESS_KEY "$R2_SECRET_ACCESS_KEY" \
102+
-e AWS_ENDPOINT_URL "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com" \
97103
-- python -m experiments.ferries.canary_ferry)
98104
echo "job_id=$JOB_ID" >> "$GITHUB_OUTPUT"
99105
echo "Submitted job: $JOB_ID"

.github/workflows/nightshift-cleanup.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ jobs:
1717
contents: write
1818
pull-requests: write
1919
issues: write
20-
actions: read
2120
id-token: write
21+
actions: read
2222
steps:
2323
- name: Checkout repository
2424
uses: actions/checkout@v4

.github/workflows/nightshift-doc-drift.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ jobs:
1717
contents: write
1818
pull-requests: write
1919
issues: write
20-
actions: read
2120
id-token: write
21+
actions: read
2222
steps:
2323
- name: Checkout repository
2424
uses: actions/checkout@v4

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ test:
5252
RAY_ADDRESS= PYTHONPATH=tests:. pytest tests --durations=0 -n 4 --tb=no -v
5353

5454
# Define regions and tags for the Docker images
55-
CLUSTER_REPOS = us-central2 us-central1 europe-west4 us-west4 asia-northeast1 us-east5 us-east1
55+
CLUSTER_REPOS = us-central2 us-central1 europe-west4 us-west4 us-east5 us-east1
5656
TAG_DATE = $(shell date -u +"%Y%m%d")
5757
TAG_VERSIONS = latest $(shell git rev-parse --short HEAD) $(TAG_DATE)
5858

experiments/dedup/isoflop_analysis.py

Lines changed: 0 additions & 138 deletions
This file was deleted.

experiments/dedup/poc_nemotron.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# Copyright 2025 The Marin Authors
5+
# SPDX-License-Identifier: Apache-2.0
6+
from iris.logging import configure_logging
7+
from marin.processing.classification.deduplication.fuzzy import dedup_fuzzy_document
8+
import os
9+
from typing import TypeVar
10+
from marin.execution.step_runner import StepRunner
11+
from marin.processing.classification.deduplication.exact import dedup_exact_paragraph
12+
from fray.v2 import ResourceConfig
13+
from iris.marin_fs import marin_temp_bucket, region_from_metadata, check_path_in_region
14+
from marin.execution.step_spec import StepSpec
15+
16+
import logging
17+
18+
logger = logging.getLogger(__name__)
19+
20+
T = TypeVar("T")
21+
22+
23+
def assert_not_none(value: T | None) -> T:
    """Return ``value`` unchanged, failing loudly if it is ``None``.

    Lets callers narrow ``T | None`` to ``T`` inline.

    Args:
        value: The possibly-``None`` value to check.

    Returns:
        ``value`` itself when it is not ``None``.

    Raises:
        AssertionError: If ``value`` is ``None``.
    """
    # Raise explicitly instead of using a bare ``assert``: assert statements
    # are stripped under ``python -O``, which would let ``None`` slip through.
    if value is None:
        raise AssertionError("expected a non-None value")
    return value
26+
27+
28+
def exect_dedup_steps_10BT() -> list[StepSpec]:
    """Build the steps for exact paragraph dedup over the ``sample/10BT`` subset.

    Returns:
        A two-element list: the raw FineWeb-Edu step, then the exact-dedup
        step that depends on it.
    """
    source = StepSpec(
        name="raw_fineweb_edu",
        override_output_path="gs://marin-eu-west4/raw/fineweb-edu-87f0914",
    )

    # Fail early if the raw data is not in the region we are running in,
    # so we never read cross-region.
    source_name = source.name
    source_path = source.output_path
    current_region = assert_not_none(region_from_metadata())
    check_path_in_region(source_name, source_path, current_region)

    scratch_prefix = marin_temp_bucket(ttl_days=1, prefix="rav")
    dedup = StepSpec(
        name="exact_dedup",
        output_path_prefix=scratch_prefix,
        deps=[source],
        fn=lambda op: dedup_exact_paragraph(
            input_paths=os.path.join(source.output_path, "sample/10BT"),
            output_path=op,
            max_parallelism=1024,
        ),
    )

    return [source, dedup]
49+
50+
51+
def exact_dedup_steps() -> list[StepSpec]:
    """Build the steps for exact paragraph dedup over two Nemotron-CC quality buckets.

    The dedup step reads the ``quality=high`` and ``quality=medium-high``
    directories under the raw step's output path.

    Returns:
        A two-element list: the raw Nemotron step, then the exact-dedup step
        that depends on it.
    """
    source = StepSpec(
        name="raw_nemotron",
        override_output_path="gs://marin-eu-west4/raw/nemotro-cc-eeb783/",
    )

    # Fail early if the raw data is not in the region we are running in,
    # so we never read cross-region.
    source_name = source.name
    source_path = source.output_path
    current_region = assert_not_none(region_from_metadata())
    check_path_in_region(source_name, source_path, current_region)

    scratch_prefix = marin_temp_bucket(ttl_days=2, prefix="rav")
    dedup = StepSpec(
        name="exact_dedup_high_medium_1",
        output_path_prefix=scratch_prefix,
        deps=[source],
        fn=lambda op: dedup_exact_paragraph(
            input_paths=[
                os.path.join(source.output_path, "contrib/Nemotron/Nemotron-CC/data-jsonl/quality=high"),
                os.path.join(source.output_path, "contrib/Nemotron/Nemotron-CC/data-jsonl/quality=medium-high"),
            ],
            output_path=op,
            max_parallelism=2048,
        ),
    )

    return [source, dedup]
75+
76+
77+
def fuzzy_dedup_steps() -> list[StepSpec]:
    """Build the steps for fuzzy document dedup over two Nemotron-CC quality buckets.

    The dedup step reads the ``quality=high`` and ``quality=medium-high``
    directories under the raw step's output path and passes an explicit
    per-worker ``ResourceConfig`` to ``dedup_fuzzy_document``.

    Returns:
        A two-element list: the raw Nemotron step, then the fuzzy-dedup step
        that depends on it.
    """
    source = StepSpec(
        name="raw_nemotron",
        override_output_path="gs://marin-eu-west4/raw/nemotro-cc-eeb783/",
    )

    # Fail early if the raw data is not in the region we are running in,
    # so we never read cross-region.
    source_name = source.name
    source_path = source.output_path
    current_region = assert_not_none(region_from_metadata())
    check_path_in_region(source_name, source_path, current_region)

    scratch_prefix = marin_temp_bucket(ttl_days=2, prefix="rav")
    dedup = StepSpec(
        name="fuzzy_dedup_full",
        output_path_prefix=scratch_prefix,
        deps=[source],
        fn=lambda op: dedup_fuzzy_document(
            input_paths=[
                os.path.join(source.output_path, "contrib/Nemotron/Nemotron-CC/data-jsonl/quality=high"),
                os.path.join(source.output_path, "contrib/Nemotron/Nemotron-CC/data-jsonl/quality=medium-high"),
            ],
            output_path=op,
            max_parallelism=2048,
            worker_resources=ResourceConfig(cpu=5, ram="32g", disk="5g"),
        ),
    )

    return [source, dedup]
102+
103+
104+
if __name__ == "__main__":
    # Script entry point: set up INFO-level logging, then run the exact-dedup
    # pipeline over the Nemotron-CC quality buckets.
    configure_logging(logging.INFO)
    runner = StepRunner()
    runner.run(exact_dedup_steps())

0 commit comments

Comments
 (0)