gke-labs
diff --git a/‎.github/workflows/build-and-push.yml‎
Lines changed: 6 additions & 1 deletion b/‎.github/workflows/build-and-push.yml‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎.github/workflows/build-pr.yml‎
Lines changed: 6 additions & 1 deletion b/‎.github/workflows/build-pr.yml‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎examples/.dockerignore‎
Lines changed: 6 additions & 0 deletions b/‎examples/.dockerignore‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎examples/README.md‎
Lines changed: 1 addition & 0 deletions b/‎examples/README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/autoresearch/Dockerfile‎
Lines changed: 39 additions & 0 deletions b/‎examples/autoresearch/Dockerfile‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎examples/autoresearch/README.md‎
Lines changed: 180 additions & 0 deletions b/‎examples/autoresearch/README.md‎
Lines changed: 180 additions & 0 deletions
diff --git a/‎examples/autoresearch/arch.png‎
706 KB b/‎examples/autoresearch/arch.png‎
706 KB
diff --git a/‎examples/autoresearch/cleanup_research_session.sh‎
Lines changed: 17 additions & 0 deletions b/‎examples/autoresearch/cleanup_research_session.sh‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎examples/autoresearch/event_log.py‎
Lines changed: 51 additions & 0 deletions b/‎examples/autoresearch/event_log.py‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎examples/autoresearch/k8s/base/kustomization.yaml‎
Lines changed: 5 additions & 0 deletions b/‎examples/autoresearch/k8s/base/kustomization.yaml‎
Lines changed: 5 additions & 0 deletions
@@ -13,9 +13,14 @@ jobs:
       matrix:
         include:
           - image_name: gateway
+            context: src/server
             dockerfile: src/server/Dockerfile.gateway
           - image_name: server
+            context: src/server
             dockerfile: src/server/Dockerfile
+          - image_name: client
+            context: examples
+            dockerfile: examples/autoresearch/Dockerfile
     permissions:
       contents: read
       packages: write
@@ -36,7 +41,7 @@ jobs:
       - name: Build and push
         uses: docker/build-push-action@v5
         with:
-          context: src/server
+          context: ${{ matrix.context }}
           file: ${{ matrix.dockerfile }}
           build-args: |
             VERSION=${{ github.sha }}
 
@@ -10,9 +10,14 @@ jobs:
       matrix:
         include:
           - image_name: gateway
+            context: src/server
             dockerfile: src/server/Dockerfile.gateway
           - image_name: server
+            context: src/server
             dockerfile: src/server/Dockerfile
+          - image_name: client
+            context: examples
+            dockerfile: examples/autoresearch/Dockerfile
     permissions:
       contents: read
     steps:
@@ -25,7 +30,7 @@ jobs:
       - name: Build
         uses: docker/build-push-action@v5
         with:
-          context: src/server
+          context: ${{ matrix.context }}
           file: ${{ matrix.dockerfile }}
           build-args: |
             VERSION=${{ github.sha }}
 
@@ -0,0 +1,6 @@
+*
+!README.md
+!pyproject.toml
+!uv.lock
+!autoresearch/
+!autoresearch/**
@@ -22,6 +22,7 @@ This directory contains examples, demos, and helper scripts for using the Open-R
 
 ### Reinforcement Learning (RL)
 * **[Text-to-SQL RL](rl/text-to-sql):** Runs the Gemma 4 SFT+RL recipe with SQL execution rewards and curve plotting.
+* **[Autoresearch Demo](autoresearch):** Runs code-RL researchers against the same Open-RL gateway using cookbook DeepCoder rewards, Sandbox Fusion, and optional Agent Sandbox CRDs.
 
 ### Tinker Cookbook
 * **[Tinker Cookbook Recipes](tinker-cookbook):** Examples showing how to run [Tinker Cookbook](https://github.com/thinking-machines-lab/tinker-cookbook) recipes with Open-RL.
 
@@ -0,0 +1,39 @@
+FROM python:3.12-slim
+
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    DEBIAN_FRONTEND=noninteractive \
+    UV_COMPILE_BYTECODE=1 \
+    UV_LINK_MODE=copy
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    git \
+    nodejs \
+    npm \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/uv
+ENV PATH="/uv/bin:$PATH"
+
+RUN npm install -g @google/gemini-cli
+
+WORKDIR /app
+
+COPY pyproject.toml uv.lock README.md ./
+COPY autoresearch ./autoresearch
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv sync --frozen
+
+WORKDIR /app/autoresearch
+
+RUN git init \
+    && git config user.email "agent@open-rl.local" \
+    && git config user.name "Autoresearch Agent" \
+    && printf ".venv/\\n__pycache__/\\n*.pyc\\nopen_rl_client.egg-info/\\n" > .gitignore \
+    && git add . \
+    && git commit -m "Initial autoresearch workspace"
+
+CMD ["./run_research_agent.sh"]
@@ -0,0 +1,180 @@
+# Open-RL Autoresearch Demo
+
+This adapts [Karpathy's autoresearch](https://github.com/karpathy/autoresearch)
+to Open-RL: an agent repeatedly edits one allowed target, runs a bounded
+measured attempt, keeps commits that improve the configured metric, and resets
+the rest. The same recipe contract works locally or in Kubernetes; in a cluster,
+each run can live in its own pod and act as a researcher while sharing the same
+storage and Open-RL backend.
+
+## Minimal Recipe Shape
+
+An autoresearch task needs three recipe-owned things:
+
+```text
+<recipe>/
+  program.md          # instructions for the agent
+  autoresearch.toml   # command, editable files, and graphed metric
+  thing_to_edit.py    # often also the command target declared in TOML
+```
+
+`program.md` tells the agent what to edit, what metric matters, what files are
+off-limits, and how to decide keep/reset.
+
+`autoresearch.toml` is the harness contract. It says how to run one attempt,
+which files the agent may edit, and which metric decides whether the attempt
+improved:
+
+```toml
+task = "my_task"
+command = "uv run recipe.py run_dir={run_dir} attempt_timeout_minutes={attempt_timeout_minutes}"
+editable = ["thing_to_edit.py"]
+metric = "accuracy"
+metric_label = "accuracy"
+metric_mode = "max"
+```
+
+The command can be any runnable benchmark or training loop. It just needs to:
+
+- accept the args used in `command`, usually at least `run_dir`
+- write attempt artifacts under `run_dir`
+- exit nonzero on failure
+- log the configured metric to `run_dir/metrics.jsonl`
+
+```python
+ml_logger.log_metrics({"accuracy": 0.73}, step=1)
+```
+
+Use the cookbook `ml_logger` for this; the shared harness treats
+`metrics.jsonl` as the only metric source. The command target does not need a
+special filename; it can be the editable recipe file itself, a fixed runner, or
+`bash run.sh`.
+
+## Included Recipes
+
+Both recipes use the same `program.md` + `autoresearch.toml` contract:
+
+| Recipe | Command Target | Editable | Metric | Guide |
+| --- | --- | --- | --- | --- |
+| Text-SQL | `recipes.text_sql.train` | `train.py` | `accuracy` | [Text-SQL](recipes/text_sql/README.md) |
+| Math-RL | `recipes.math_rl.train` | `config.toml` | `accuracy` | [Math-RL](recipes/math_rl/README.md) |
+
+Use the recipe guides for local one-attempt runs, local UI serving, and
+recipe-specific settings.
+
+## Architecture
+
+Autoresearch runs as a small Kubernetes add-on around the shared Open-RL
+infrastructure. A recipe overlay starts the UI plus one researcher Sandbox per
+researcher. Each Sandbox runs Gemini CLI, edits the recipe, launches attempts,
+and calls the shared Open-RL/Tinker services.
+
+![Autoresearch architecture](arch.png)
+
+## Cluster Run
+
+Create the API secret for agent-backed researcher pods:
+
+```bash
+kubectl create secret generic researcher-agent-secrets \
+  --from-literal=GEMINI_API_KEY="${GEMINI_API_KEY}"
+```
+
+Choose one recipe overlay:
+
+```bash
+# Fast text-SQL, no model server.
+kubectl apply -k examples/autoresearch/recipes/text_sql
+
+# Math-RL add-on. First deploy Open-RL with docs/setup/gke-setup.md,
+# or reuse an existing backend at http://open-rl-gateway-service:8000.
+kubectl apply -k examples/autoresearch/recipes/math_rl
+
+# Convenience one-shot Math-RL stack: Open-RL backend + autoresearch add-on.
+kubectl apply -k examples/autoresearch/recipes/math_rl/gke
+```
+
+Each overlay starts one Sandbox that runs one Gemini CLI researcher. If that
+process exits nonzero or the pod crashes, the run stops; Kubernetes does not
+retry it. The intended recovery is to inspect the UI/logs and start a new run
+explicitly.
+
+Open the UI:
+
+```bash
+kubectl port-forward svc/open-rl-autoresearch-ui 8080:8080
+```
+
+```text
+http://localhost:8080/experiments.html
+```
+
+Use the normal [GKE setup guide](../../docs/setup/gke-setup.md) for cluster,
+GPU, storage, and the Open-RL backend. These overlays add researcher sandboxes and
+the UI on top of that shared backend.
+
+Researcher pods wait for comma-separated `READY_URLS` before the agent starts, so
+early pod startup does not race vLLM, the trainer worker, or the gateway. The
+convenience Math-RL stack sets those URLs for vLLM, trainer, and gateway health.
+
+## Shared Pieces
+
+```text
+run_research_agent.sh  # launches one timeout-bounded agent
+run_attempt.py         # runs one measured attempt and records UI events
+ui/observer.py         # read-only UI server over recorded events
+k8s/base/              # reusable Sandbox/UI resources
+```
+
+`run_attempt.py` runs recipe code and writes artifacts. The UI reads only
+`LOG_ROOT/*/ui_events.jsonl`; clearing `LOG_ROOT` resets attempts, live rows,
+and per-attempt agent-log cursors.
+
+The launcher passes the recipe-adjacent `program.md` to Gemini as the prompt.
+That program tells the agent to edit only the declared target, commit the
+attempt, run `eval "${RUN_ATTEMPT_COMMAND}"`, record the metric, and reset if
+the metric did not improve.
+
+## Adding A Recipe
+
+Copy one existing recipe directory and update:
+
+- `program.md`
+- `autoresearch.toml`
+- the command target, if you keep one
+- the editable target
+- `kustomization.yaml` settings: `RECIPE`, `LOG_ROOT`, and
+  `ATTEMPT_TIMEOUT_MINUTES`
+- optionally `AGENT_TIMEOUT_MINUTES`, if the researcher pod should stop before
+  Kubernetes cleanup does
+- optionally `READY_URLS`, if attempts need external services to be healthy
+  before the agent starts
+
+The shared wrapper handles logs, diffs, metrics, status, and UI events. Recipe
+code should focus on running the benchmark or training loop and emitting the
+metric.
+
+## Timeouts And Cleanup
+
+`ATTEMPT_TIMEOUT_MINUTES` caps one measured training/eval run. Every attempt gets
+the same value, so scores are comparable.
+
+`AGENT_TIMEOUT_MINUTES` caps the outer Gemini process. One agent can run several
+attempts inside this window: run the default config, edit, commit, run attempt,
+decide keep/reset, then repeat. Setup happens before this clock starts.
+
+Clean up a session:
+
+```bash
+OVERLAY=examples/autoresearch/recipes/text_sql \
+  examples/autoresearch/cleanup_research_session.sh
+```
+
+To also clear shared run data:
+
+```bash
+DELETE_ARTIFACTS=1 \
+LOG_ROOT=/mnt/shared/open-rl/autoresearch/text_sql \
+OVERLAY=examples/autoresearch/recipes/text_sql \
+  examples/autoresearch/cleanup_research_session.sh
+```
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+OVERLAY="${OVERLAY:-examples/autoresearch/recipes/text_sql}"
+NAMESPACE="${NAMESPACE:-default}"
+DELETE_ARTIFACTS="${DELETE_ARTIFACTS:-0}"
+LOG_ROOT="${LOG_ROOT:-}"
+
+kubectl -n "${NAMESPACE}" delete -k "${OVERLAY}" --ignore-not-found=true
+
+if [ "${DELETE_ARTIFACTS}" = "1" ]; then
+  if [ -z "${LOG_ROOT}" ]; then
+    echo "DELETE_ARTIFACTS=1 requires LOG_ROOT" >&2
+    exit 2
+  fi
+  rm -rf "${LOG_ROOT}"
+fi
@@ -0,0 +1,51 @@
+"""Shared JSONL event writer for autoresearch UI state."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import time
+from pathlib import Path
+from typing import Any
+
+UI_EVENTS_FILE = "ui_events.jsonl"
+
+
+def append_ui_events(event_dir: Path, events: list[dict[str, Any]]) -> None:
+  path = event_dir / UI_EVENTS_FILE
+  path.parent.mkdir(parents=True, exist_ok=True)
+  now = time.time()
+  with path.open("a", encoding="utf-8") as f:
+    for event in events:
+      f.write(json.dumps({"time": now, **event}, sort_keys=True) + "\n")
+
+
+def activity_events(args: argparse.Namespace) -> list[dict[str, Any]]:
+  base = {
+    "attempt_timeout_minutes": args.attempt_timeout_minutes,
+    "kind": "activity",
+    "order": 0,
+    "researcher": args.researcher,
+    "status": args.status,
+    "work_id": f"{args.researcher}-activity",
+  }
+  return [
+    {**base, "tab": "agent", "path": args.agent_log},
+    {**base, "tab": "notes", "path": args.notes},
+  ]
+
+
+def main(argv: list[str] | None = None) -> None:
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--event-dir", type=Path, required=True)
+  parser.add_argument("--researcher", required=True)
+  parser.add_argument("--status", required=True)
+  parser.add_argument("--attempt-timeout-minutes", type=float, required=True)
+  parser.add_argument("--agent-log", required=True)
+  parser.add_argument("--notes", required=True)
+  args = parser.parse_args(argv)
+  append_ui_events(args.event_dir, activity_events(args))
+
+
+if __name__ == "__main__":
+  main()
@@ -0,0 +1,5 @@
+resources:
+- resources.yaml
+
+configurations:
+- kustomizeconfig.yaml