
Commit 00bdcc8

Merge origin/main into tdv/reasoning-gym-pr1
2 parents: e3e2aa5 + a136498

217 files changed: +22230 −7323 lines


.agents/skills/fix-docs/SKILL.md

Lines changed: 5 additions & 0 deletions

@@ -1,3 +1,8 @@
+---
+name: fix-docs
+description: Fix markdown docs in `lib/iris`, `lib/zephyr`, and `lib/fray` to align with Marin's agent-doc principles. Use when asked to repair, modernize, or de-rot docs in those directories.
+---
+
 Your task is to fix the markdown docs within `lib/iris`, `lib/zephyr` and `lib/fray` so that they maximally comply with the principles below. Do NOT fix docs outside of the aforementioned directories.

 Your output: You will dispatch sub-agents that will (1) thoroughly parse the code and the docs and (2) make all the documentation changes that are deemed appropriate, locally. You will commit the changes locally into a single commit, inform the user of the commit, and summarize the changes you made. Under no circumstances should you push any commit to the repo without explicit approval from the user.

.github/workflows/iris-coreweave-ci.yaml

Lines changed: 1 addition & 1 deletion

@@ -104,7 +104,7 @@ jobs:
         run: |
           cd lib/iris && uv run --group dev iris -v \
             --config=examples/coreweave-ci.yaml \
-            cluster start
+            cluster start --fresh

       - name: Run integration tests
         env:

.github/workflows/iris-dev-restart.yaml

Lines changed: 2 additions & 2 deletions

@@ -2,8 +2,8 @@ name: Iris - Dev Cluster Daily Restart

 on:
   schedule:
-    # Daily at 06:00 UTC
-    - cron: "0 6 * * *"
+    # Daily at 05:00 UTC — staggered before canary ferry (06:00 UTC)
+    - cron: "0 5 * * *"
   workflow_dispatch:

 permissions:

.github/workflows/marin-canary-ferry-cw.yaml

Lines changed: 3 additions & 3 deletions

@@ -58,7 +58,7 @@ jobs:
           enable-cache: true

       - name: Install dependencies
-        run: uv sync --all-packages --extra=cpu --no-default-groups
+        run: uv sync --all-packages --extra=cpu --extra=controller --no-default-groups

       - name: Write CoreWeave kubeconfig
         run: |
@@ -89,7 +89,7 @@ jobs:
         run: |
           JOB_ID=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
             job run --no-wait \
-            --memory=16G --disk=16G --cpu=1 --extra=cpu \
+            --memory=2G --disk=4G --cpu=1 --extra=cpu \
             -e MARIN_PREFIX s3://marin-na/marin/ \
             -e RUN_ID "$RUN_ID" \
             -e CANARY_ACCELERATOR "$CANARY_ACCELERATOR" \
@@ -195,7 +195,7 @@ jobs:
           Read .agents/skills/canary-triage/SKILL.md and follow it.
         claude_args: |
           --model opus
-          --max-turns 50
+          --max-turns 500
           --allowedTools "Bash(kubectl:*),Bash(gh:*),Bash(.venv/bin/iris:*),Bash(.venv/bin/python:*),Bash(cat:*),Bash(jq:*),Bash(head:*),Bash(tail:*),Bash(grep:*)"
         env:
           CANARY_LANE: gpu

.github/workflows/marin-canary-ferry.yaml

Lines changed: 2 additions & 2 deletions

@@ -78,7 +78,7 @@ jobs:
         run: |
           JOB_ID=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
             job run --no-wait \
-            --memory=16G --disk=16G --cpu=1 --extra=cpu \
+            --memory=2G --disk=4G --cpu=1 --extra=cpu \
             --reserve v5p-8 \
             -e RUN_ID "$RUN_ID" \
             -e CANARY_ACCELERATOR "$CANARY_ACCELERATOR" \
@@ -165,7 +165,7 @@ jobs:
           Read .agents/skills/canary-triage/SKILL.md and follow it.
         claude_args: |
           --model opus
-          --max-turns 50
+          --max-turns 500
           --allowedTools "Bash(gh:*),Bash(.venv/bin/iris:*),Bash(.venv/bin/python:*),Bash(cat:*),Bash(jq:*),Bash(head:*),Bash(tail:*),Bash(grep:*)"
         env:
           CANARY_LANE: tpu

.github/workflows/marin-datakit-smoke.yaml

Lines changed: 18 additions & 5 deletions

@@ -19,7 +19,7 @@ jobs:
     cancel-in-progress: true
     env:
       SMOKE_RUN_ID: datakit-smoke-${{ github.run_id }}-${{ github.run_attempt }}
-      # MARIN_PREFIX is defaulted by the ferry entrypoint to marin_temp_bucket(ttl_days=1).
+      FERRY_STATUS_PATH: gs://marin-tmp-us-central1/ttl=1d/ci/datakit-smoke-${{ github.run_id }}-${{ github.run_attempt }}/ferry_run_status.json
       WANDB_ENTITY: marin-community
       WANDB_PROJECT: marin
       IRIS_CONFIG: lib/iris/examples/marin-dev.yaml
@@ -70,6 +70,7 @@ jobs:
             job run --no-wait \
             --memory=2G --disk=4G --cpu=1 --extra=cpu \
             -e SMOKE_RUN_ID "$SMOKE_RUN_ID" \
+            -e FERRY_STATUS_PATH "$FERRY_STATUS_PATH" \
             -e WANDB_ENTITY "$WANDB_ENTITY" \
             -e WANDB_PROJECT "$WANDB_PROJECT" \
             -e WANDB_API_KEY "$WANDB_API_KEY" \
@@ -113,12 +114,24 @@ jobs:
           esac
         done

+      - name: Read ferry status
+        id: ferry_status
+        shell: bash -l {0}
+        run: |
+          PREFIX=$(.venv/bin/python -c "
+          import json
+          from rigging.filesystem import url_to_fs
+          fs, _ = url_to_fs('$FERRY_STATUS_PATH')
+          with fs.open('$FERRY_STATUS_PATH') as f:
+              print(json.load(f)['marin_prefix'])
+          ")
+          echo "marin_prefix=$PREFIX" >> "$GITHUB_OUTPUT"
+          echo "Ferry output prefix: $PREFIX"
+
       - name: Validate datakit smoke outputs
         shell: bash -l {0}
         env:
-          SMOKE_RUN_ID: ${{ env.SMOKE_RUN_ID }}
-          # MARIN_PREFIX intentionally unset — validate script defaults via marin_temp_bucket,
-          # matching the ferry entrypoint default.
+          MARIN_PREFIX: ${{ steps.ferry_status.outputs.marin_prefix }}
         run: .venv/bin/python scripts/datakit/validate_ferry_outputs.py

       - name: Capture failure diagnostics
@@ -143,7 +156,7 @@ jobs:
           Read .agents/skills/canary-triage/SKILL.md and follow it.
         claude_args: |
           --model opus
-          --max-turns 50
+          --max-turns 500
           --allowedTools "Bash(gh:*),Bash(.venv/bin/iris:*),Bash(.venv/bin/python:*),Bash(cat:*),Bash(jq:*),Bash(head:*),Bash(tail:*),Bash(grep:*)"
         env:
           CANARY_LANE: datakit-smoke
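The "Read ferry status" step above parses `marin_prefix` out of the JSON blob the ferry entrypoint writes, replacing the old "MARIN_PREFIX intentionally unset" convention with an explicit handshake file. A minimal stdlib sketch of that consumer side, using a local temp file in place of the `gs://` status path (the helper name `read_marin_prefix` is illustrative, not part of the repo):

```python
import json
import tempfile
from pathlib import Path

def read_marin_prefix(status_path: str) -> str:
    """Parse the ferry status JSON and return the resolved MARIN_PREFIX."""
    payload = json.loads(Path(status_path).read_text())
    return payload["marin_prefix"]

# Simulate the ferry entrypoint writing its status, then the CI step reading it.
status_file = Path(tempfile.mkdtemp()) / "ferry_run_status.json"
status_file.write_text(
    json.dumps({"status": "succeeded", "marin_prefix": "gs://example-bucket/run-1"})
)
print(read_marin_prefix(str(status_file)))  # → gs://example-bucket/run-1
```

In the real workflow the read goes through `rigging.filesystem.url_to_fs` so the same code path handles `gs://` and local URLs.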
.github/workflows/marin-libs-wheels.yaml (new file)

Lines changed: 100 additions & 0 deletions

@@ -0,0 +1,100 @@
+name: marin-libs - Build Wheels
+
+on:
+  workflow_dispatch:
+    inputs:
+      mode:
+        description: "Build mode"
+        type: choice
+        options: [nightly, manual]
+        default: manual
+  schedule:
+    - cron: "0 6 * * *" # 06:00 UTC daily
+  push:
+    tags:
+      - "marin-libs-v*"
+  pull_request:
+    paths:
+      - "lib/**"
+      - "scripts/python_libs_package.py"
+      - ".github/workflows/marin-libs-wheels.yaml"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: false # don't kill an in-flight nightly mid-publish
+
+permissions:
+  contents: write # creating GH releases
+  pull-requests: read
+
+jobs:
+  resolve:
+    runs-on: ubuntu-latest
+    outputs:
+      mode: ${{ steps.pick.outputs.mode }}
+      version: ${{ steps.pick.outputs.version }}
+    steps:
+      - id: pick
+        run: |
+          set -euo pipefail
+          if [[ "${GITHUB_EVENT_NAME}" == "push" && "${GITHUB_REF}" == refs/tags/marin-libs-v* ]]; then
+            echo "mode=stable" >> "$GITHUB_OUTPUT"
+            echo "version=${GITHUB_REF_NAME#marin-libs-v}" >> "$GITHUB_OUTPUT"
+          elif [[ "${GITHUB_EVENT_NAME}" == "schedule" ]]; then
+            echo "mode=nightly" >> "$GITHUB_OUTPUT"
+            echo "version=" >> "$GITHUB_OUTPUT"
+          elif [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]]; then
+            echo "mode=${{ github.event.inputs.mode }}" >> "$GITHUB_OUTPUT"
+            echo "version=" >> "$GITHUB_OUTPUT"
+          else
+            # pull_request: build-only smoke test
+            echo "mode=manual" >> "$GITHUB_OUTPUT"
+            echo "version=" >> "$GITHUB_OUTPUT"
+          fi
+
+  build:
+    needs: resolve
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # for git rev-parse in manual mode
+      - uses: astral-sh/setup-uv@v7
+
+      - name: Build wheels
+        run: |
+          uv run python scripts/python_libs_package.py \
+            --mode "${{ needs.resolve.outputs.mode }}" \
+            ${{ needs.resolve.outputs.version && format('--version {0}', needs.resolve.outputs.version) || '' }} \
+            --skip-publish
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: marin-libs-wheels
+          # BUILD_INFO.json travels with the wheels so the publish job uses
+          # the same resolved version the build job stamped in, instead of
+          # re-computing it (which would drift across midnight UTC).
+          path: |
+            dist/*.whl
+            dist/BUILD_INFO.json
+          retention-days: 14
+
+  publish:
+    needs: [resolve, build]
+    if: github.event_name != 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v7
+      - uses: actions/download-artifact@v4
+        with:
+          name: marin-libs-wheels
+          path: dist
+      - name: Publish releases and prune nightlies
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          uv run python scripts/python_libs_package.py \
+            --mode "${{ needs.resolve.outputs.mode }}" \
+            ${{ needs.resolve.outputs.version && format('--version {0}', needs.resolve.outputs.version) || '' }} \
+            --publish-only
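The `resolve` job's shell branching above maps each triggering event to a (mode, version) pair. That logic can be mirrored as a small pure function for reasoning about the mapping (a sketch; `resolve_build` is a hypothetical helper, not code from the repo):

```python
def resolve_build(
    event_name: str,
    ref: str = "",
    ref_name: str = "",
    dispatch_mode: str = "manual",
) -> tuple[str, str]:
    """Mirror the workflow's resolve step: map a GitHub event to (mode, version)."""
    if event_name == "push" and ref.startswith("refs/tags/marin-libs-v"):
        # Stable release: version comes straight from the tag name.
        return "stable", ref_name.removeprefix("marin-libs-v")
    if event_name == "schedule":
        return "nightly", ""
    if event_name == "workflow_dispatch":
        return dispatch_mode, ""
    # pull_request: build-only smoke test, no publish
    return "manual", ""

print(resolve_build("push", "refs/tags/marin-libs-v1.2.0", "marin-libs-v1.2.0"))
# → ('stable', '1.2.0')
```

Keeping the `pull_request` case as plain `manual` matches the workflow's `publish` job guard, which skips publishing for PR builds entirely.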

experiments/defaults.py

Lines changed: 15 additions & 1 deletion

@@ -206,6 +206,9 @@ def default_tokenize(
     *,
     sample_count: int | VersionedValue[int] | None = None,
     is_validation: bool = False,
+    levanter_batch_size: int | None = None,
+    resources: ResourceConfig | None = None,
+    worker_resources: ResourceConfig | None = None,
 ) -> ExecutorStep:
     """
     Tokenizes a dataset using the specified tokenizer and Levanter's tokenization infrastructure.
@@ -228,6 +231,11 @@
         An ExecutorStep that represents the tokenized dataset.
     """

+    # Common kwargs for config constructors
+    extra_kwargs: dict = {}
+    if worker_resources is not None:
+        extra_kwargs["worker_resources"] = worker_resources
+
     # sniff out if it's a HuggingFace dataset
     if isinstance(dataset, HfDatasetSpec):
         config = HfTokenizeConfig(
@@ -237,6 +245,8 @@
             tokenizer=ensure_versioned(tokenizer),
             format=format,
             sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
+            levanter_batch_size=levanter_batch_size,
+            **extra_kwargs,
         )
     elif (
         isinstance(dataset, str)
@@ -250,6 +260,8 @@
             tokenizer=ensure_versioned(tokenizer),
             format=format,
             sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
+            levanter_batch_size=levanter_batch_size,
+            **extra_kwargs,
         )
     else:
         config = TokenizeConfig(
@@ -259,14 +271,16 @@
             tokenizer=ensure_versioned(tokenizer),
             format=format,
             sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
+            levanter_batch_size=levanter_batch_size,
+            **extra_kwargs,
         )

     return ExecutorStep(
         name=os.path.join("tokenized", name),
         description=f"Tokenize raw text using the {tokenizer} tokenizer.",
         fn=remote(
             tokenize,
-            resources=ResourceConfig.with_cpu(cpu=4, ram="16g", disk="10g"),
+            resources=resources or ResourceConfig.with_cpu(cpu=4, ram="16g", disk="10g"),
             pip_dependency_groups=["cpu"],
             env_vars={
                 "TRANSFORMERS_NO_TORCH": "1",

experiments/ferries/datakit_ferry.py

Lines changed: 19 additions & 2 deletions

@@ -7,10 +7,11 @@
 Output paths are placed under ``$MARIN_PREFIX/datakit-smoke/$SMOKE_RUN_ID/...``.
 """

+import json
 import logging
 import os

-from rigging.filesystem import marin_temp_bucket
+from rigging.filesystem import marin_temp_bucket, url_to_fs
 from rigging.log_setup import configure_logging

 from fray import ResourceConfig
@@ -109,14 +110,30 @@ def build_steps(run_id: str) -> list[StepSpec]:
     return [downloaded, normalized, deduped, consolidated, tokenized]


+def _write_status(status: str, marin_prefix: str) -> None:
+    """Write ferry run status to FERRY_STATUS_PATH if set."""
+    status_path = os.environ.get("FERRY_STATUS_PATH")
+    if not status_path:
+        return
+    payload = json.dumps({"status": status, "marin_prefix": marin_prefix})
+    fs, _ = url_to_fs(status_path)
+    with fs.open(status_path, "w") as f:
+        f.write(payload)
+    logger.info("Wrote ferry status to %s", status_path)
+
+
 def main() -> None:
     configure_logging()
     if not os.environ.get("MARIN_PREFIX"):
         os.environ["MARIN_PREFIX"] = marin_temp_bucket(ttl_days=1)

-    logger.info("MARIN_PREFIX defaulted to %s", os.environ["MARIN_PREFIX"])
+    marin_prefix = os.environ["MARIN_PREFIX"]
+    logger.info("MARIN_PREFIX defaulted to %s", marin_prefix)
     run_id = os.environ["SMOKE_RUN_ID"]
+
+    _write_status("running", marin_prefix)
     StepRunner().run(build_steps(run_id))
+    _write_status("succeeded", marin_prefix)


 if __name__ == "__main__":
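The `_write_status` calls bracket the run so CI can distinguish a ferry that never started from one that started and died: "running" is written before any step executes, "succeeded" only after all steps finish. A local-filesystem sketch of that lifecycle; note that the `failed` branch here is an assumption for illustration, the actual diff only writes `running` and `succeeded`:

```python
import json
import tempfile
from pathlib import Path

def write_status(status_path: Path, status: str, marin_prefix: str) -> None:
    # Stand-in for the fsspec-backed _write_status: one small JSON blob a
    # CI step can read after the job exits.
    status_path.write_text(json.dumps({"status": status, "marin_prefix": marin_prefix}))

def run_ferry(status_path: Path, marin_prefix: str, steps) -> None:
    write_status(status_path, "running", marin_prefix)
    try:
        for step in steps:
            step()
        write_status(status_path, "succeeded", marin_prefix)
    except Exception:
        # Hypothetical: the real entrypoint leaves "running" in place on failure.
        write_status(status_path, "failed", marin_prefix)
        raise

path = Path(tempfile.mkdtemp()) / "ferry_run_status.json"
run_ferry(path, "gs://example-bucket/run-1", [lambda: None])
print(json.loads(path.read_text())["status"])  # → succeeded
```

Because the status file also carries `marin_prefix`, the downstream validate step no longer has to re-derive the temp-bucket default itself.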

experiments/pretraining_datasets/__init__.py

Lines changed: 9 additions & 0 deletions

@@ -48,6 +48,10 @@
     downloads as nemotron_v2_downloads,
     tokenize_nemotron_v2_family,
 )
+from experiments.pretraining_datasets.common_corpus import (
+    common_corpus_download,
+    tokenize_common_corpus,
+)
 from experiments.pretraining_datasets.nsf_awards import (
     nsf_awards_download,
     nsf_awards_tokenized,
@@ -117,6 +121,11 @@
         "download": dolmino_downloads["dolmino"],
         "tokenize_fn": lambda: {"dolmino_math/all": tokenize_dolmino_math()},
     },
+    "common_corpus": {
+        "subsets": ["all"],
+        "download": common_corpus_download,
+        "tokenize_fn": lambda: {"common_corpus/all": tokenize_common_corpus()},
+    },
     "nsf_awards": {
         "subsets": ["all"],
         "download": nsf_awards_download,
