diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 000000000..16253061d
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,15 @@
+{
+  "hooks": {
+    "PostToolUse": [
+      {
+        "matcher": "Edit|Write",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "if [[ \"$TOOL_INPUT\" == *\".go\"* ]]; then ./bin/openshift-goimports 2>/dev/null; fi"
+          }
+        ]
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/.claude/skills/add-benchmark/SKILL.md b/.claude/skills/add-benchmark/SKILL.md
new file mode 100644
index 000000000..4cf83a786
--- /dev/null
+++ b/.claude/skills/add-benchmark/SKILL.md
@@ -0,0 +1,147 @@
+# Add Benchmark
+
+Guide for adding a new benchmark to `benchmarks/` in the distributed-workloads repo.
+
+## Directory layout
+
+Each benchmark lives in its own subdirectory under `benchmarks/`:
+
+```
+benchmarks/<benchmark-name>/
+  Dockerfile              # Multi-stage build for the benchmark image
+  Dockerfile.cuda         # (optional) CUDA variant
+  mpi-runtime.yaml        # ClusterTrainingRuntime defining the MPI execution environment
+  trainjob.yaml           # TrainJob manifest to submit the benchmark
+  README.md               # Documentation (what, files, quick start, parameters, output)
+  <scripts>               # (optional) Training/benchmark scripts mounted via ConfigMap
+```
+
+See `benchmarks/osu-benchmarks/` and `benchmarks/kftv2-mpi-ddp-sft/` for reference implementations.
+
+## Dockerfile conventions
+
+Follow the multi-stage build pattern used in `benchmarks/osu-benchmarks/Dockerfile`:
+
+1. **Stage 1 (builder)** - compile dependencies from source (e.g., OpenMPI, benchmark binaries)
+2. **Stage 2 (runtime)** - copy built artifacts, configure SSH for MPI, set up the runtime environment
+
+Key requirements:
+- Base image from `quay.io/opendatahub/` or `quay.io/modh/`
+- `USER 0` only during build stages; final image must use `USER 1001`
+- OpenShift GID 0 pattern: `chgrp -R 0 <dir> && chmod -R g=u <dir>`
+- Allow random UID: `chmod g=u /etc/passwd`
+- SSH authentication via Training Operator's `sshAuthMountPath` -- keys are auto-injected at the path specified in the ClusterTrainingRuntime, not baked into the image. Workers generate host keys at startup.
+- For CUDA variants, create a separate `Dockerfile.cuda` extending the base
+
+## ClusterTrainingRuntime
+
+Define a `ClusterTrainingRuntime` resource with MPI configuration. Key fields:
+
+```yaml
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: ClusterTrainingRuntime
+metadata:
+  name: <runtime-name>
+spec:
+  mlPolicy:
+    mpi:
+      mpiImplementation: OpenMPI
+      sshAuthMountPath: /tmp/ssh
+  template:
+    spec:
+      replicatedJobs:
+        - name: launcher
+          replicas: 1
+          template: ...
+        - name: worker
+          replicas: <N>
+          template: ...
+```
+
+- Launcher: runs the benchmark command (mpirun/mpiexec)
+- Workers: run sshd and wait for MPI connections
+- Both need the SSH setup commands in their entrypoints
+
+See `benchmarks/osu-benchmarks/mpi-runtime-cpu.yaml` for a complete example.
+
+## TrainJob
+
+Submit benchmarks using a `TrainJob` with `generateName` (not fixed `name`):
+
+```yaml
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: TrainJob
+metadata:
+  generateName: <benchmark-name>-
+  namespace: <namespace>
+spec:
+  runtimeRef:
+    apiGroup: trainer.kubeflow.org
+    kind: ClusterTrainingRuntime
+    name: <runtime-name>
+  trainer:
+    numNodes: 2
+    resourcesPerNode:
+      requests:
+        nvidia.com/gpu: "2"
+    env:
+      - name: PARAM_NAME
+        value: "value"
+```
+
+Use `trainer.env` for benchmark parameters - the controller injects them into all pod containers.
+
+See `benchmarks/kftv2-mpi-ddp-sft/trainjob.yaml` for a complete example.
+
+## Makefile targets
+
+Add build/push targets to the root `Makefile` following the existing pattern:
+
+```makefile
+BENCHMARK_VERSION ?= latest
+
+.PHONY: build-<name>-benchmark-image
+build-<name>-benchmark-image:
+	$(CONTAINER_ENGINE) build -t quay.io/modh/distributed-workloads-benchmark:trainer-mpi-<name>-$(BENCHMARK_VERSION) \
+	  -f benchmarks/<name>/Dockerfile benchmarks/<name>/
+
+.PHONY: push-<name>-benchmark-image
+push-<name>-benchmark-image:
+	$(CONTAINER_ENGINE) push quay.io/modh/distributed-workloads-benchmark:trainer-mpi-<name>-$(BENCHMARK_VERSION)
+```
+
+Registry: `quay.io/modh/distributed-workloads-benchmark`
+Tag format: `trainer-mpi-<name>-<version>`
+
+## CI workflow
+
+Create `.github/workflows/build-and-push-<name>-benchmark.yml` matching the structure in `build-and-push-osu-benchmark.yml`:
+
+- Trigger on push/PR when files under `benchmarks/<name>/` change
+- Build on all branches, push only on `main`
+- Use `docker/build-push-action` with appropriate Dockerfile path
+
+## README
+
+Every benchmark must include a `README.md` with these sections (see `benchmarks/kftv2-mpi-ddp-sft/README.md`):
+
+| Section | Content |
+|---------|---------|
+| Title + summary | One-line description of what the benchmark measures |
+| What this benchmark does | Table with algorithm, model, dataset, backend, runtime, image |
+| Files | Table mapping each file to its purpose |
+| Quick start | Numbered steps: deploy runtime, create namespace/ConfigMap, submit TrainJob, monitor |
+| Scaling | Table showing node/GPU configurations |
+| Benchmark parameters | Tables for training and infrastructure parameters with defaults and impact |
+| Expected output | Example benchmark summary output |
+| Known issues | Documented limitations and workarounds |
+| Cleanup | Commands to remove all created resources |
+
+## Checklist
+
+- [ ] Dockerfile builds successfully: `make build-<name>-benchmark-image`
+- [ ] ClusterTrainingRuntime applies: `oc apply -f benchmarks/<name>/mpi-runtime.yaml`
+- [ ] TrainJob submits and runs: `oc create -f benchmarks/<name>/trainjob.yaml`
+- [ ] README has all required sections
+- [ ] Makefile targets added for build and push
+- [ ] CI workflow triggers on path changes to `benchmarks/<name>/`
diff --git a/.claude/skills/add-e2e-test/SKILL.md b/.claude/skills/add-e2e-test/SKILL.md
new file mode 100644
index 000000000..3e24c53ff
--- /dev/null
+++ b/.claude/skills/add-e2e-test/SKILL.md
@@ -0,0 +1,97 @@
+# Add E2E Test
+
+Guide for adding a new end-to-end test to the distributed-workloads repo.
+
+## Test structure
+
+```go
+func TestMyFeature(t *testing.T) {
+    Tags(t, Tier1)         // 1. tag / skip checks
+    test := With(t)        // 2. create test context
+
+    namespace := test.NewTestNamespace().Name  // 3. isolated namespace
+
+    // 4. create resources with GenerateName
+    // 5. ensure cleanup of cluster-scoped resources
+    // 6. assert with test.Eventually(...)
+}
+```
+
+## Namespace isolation
+
+Every test must operate in its own dedicated namespace. Use `test.NewTestNamespace()` — it creates a uniquely named namespace and registers automatic cleanup (log collection + deletion) via `t.Cleanup`:
+
+```go
+namespace := test.NewTestNamespace().Name
+```
+
+Never use a fixed namespace name unless driven by an env var for a specific scenario (e.g., pre-upgrade/post-upgrade tests). Shared namespaces cause interference between tests.
+
+## Resource naming
+
+All Kubernetes resources must use `GenerateName` instead of a fixed `Name` to avoid collisions:
+
+```go
+// Good
+ObjectMeta: metav1.ObjectMeta{GenerateName: "test-trainjob-"}
+
+// Bad
+ObjectMeta: metav1.ObjectMeta{Name: "my-trainjob"}
+```
+
+## Cleanup
+
+Namespace-scoped resources are deleted automatically when the test namespace is cleaned up. Cluster-scoped resources (e.g., `ClusterRole`, `ClusterRoleBinding`) are not namespace-bound and may need to be explicitly cleaned up if the helper creating them does not already register a cleanup hook via `t.T().Cleanup(...)`.
+
+## Tags
+
+All tests **must** declare a tag -- this is mandatory. Apply it as the first statement so tests are skipped early when `TEST_TIER` is set:
+
+| Tag | When to use |
+|-----|-------------|
+| `Smoke` | Minimal deployment verification |
+| `Tier1`–`Tier3` | Progressively deeper coverage |
+| `Gpu(accelerator)` | Requires at least one GPU node |
+| `MultiGpu(accelerator, n)` | Requires n GPUs per node |
+| `MultiNode(n)` | Requires n worker nodes |
+| `MultiNodeGpu(n, accelerator)` | Requires n nodes each with at least one GPU |
+| `MultiNodeMultiGpu(n, accelerator, gpus)` | Requires n nodes each with at least gpus GPUs |
+
+## Environment variables
+
+Declare env var constants and getter functions in `tests/common/support/environment.go`. Never use `os.Getenv` directly in test files — always go through a getter.
+
+## Editing notebooks
+
+Test notebooks (`tests/**/resources/*.ipynb`) use 1-space JSON indentation with no trailing newline. When editing notebook cells, preserve the array-of-lines source format — do not collapse source arrays into single strings:
+
+```jsonc
+// Good — array of lines, readable in raw JSON
+"source": [
+ "import os\n",
+ "print('hello')"
+]
+
+// Bad — single string, hard to read in raw JSON
+"source": "import os\nprint('hello')"
+```
+
+If a tool (e.g. `NotebookEdit`) converts the edited cell's source to a single string, convert it back to array-of-lines before committing. You can use a Python script like the following (set `path` to the notebook file first):
+
+```python
+import json
+with open(path, encoding="utf-8") as f:
+    nb = json.load(f)
+for cell in nb["cells"]:
+    if isinstance(cell["source"], str):
+        cell["source"] = cell["source"].splitlines(True)
+        # Ensure last line has no trailing newline (notebook convention)
+        if cell["source"] and cell["source"][-1].endswith("\n"):
+            cell["source"][-1] = cell["source"][-1][:-1]
+with open(path, "w", encoding="utf-8") as f:
+    json.dump(nb, f, indent=1, ensure_ascii=False)
+```
+
+## Key support library files
+
+See the [update-support-lib skill](../update-support-lib/SKILL.md) for the full file map. The most frequently used files when writing tests: `test.go` (Test interface), `client.go` (API clients), `environment.go` (env var getters), and the per-API helpers (`trainjob.go`, `pytorchjob.go`, `ray.go`, `kueue.go`).
diff --git a/.claude/skills/update-support-lib/SKILL.md b/.claude/skills/update-support-lib/SKILL.md
new file mode 100644
index 000000000..bdbebc906
--- /dev/null
+++ b/.claude/skills/update-support-lib/SKILL.md
@@ -0,0 +1,237 @@
+# Update Support Library
+
+Guide for modifying the shared test support library at `tests/common/support/`.
+
+## File organization
+
+| File | Domain |
+|------|--------|
+| `test.go` | Test interface (`With(t)`, `Ctx()`, `Client()`, `NewTestNamespace()`) |
+| `client.go` | Client interface with 13 API accessors (Core, Trainer, Kubeflow, Ray, etc.) |
+| `namespace.go` | Namespace creation, cleanup, log/event capture |
+| `environment.go` | Environment variable constants and typed getter functions |
+| `defaults.go` | Hardcoded default image versions and fallback values |
+| `core.go` | Pod, ConfigMap, Secret, PVC helpers |
+| `trainjob.go` | TrainJob getters and condition checkers |
+| `pytorchjob.go` | PyTorchJob getters and condition checkers |
+| `ray.go` | RayJob/RayCluster helpers |
+| `kueue.go` | ResourceFlavor, ClusterQueue, LocalQueue helpers |
+| `conditions.go` | Generic Kubernetes condition evaluation |
+| `events.go` | Event capture and formatting for debugging |
+| `rbac.go` | Role/RoleBinding creation |
+| `accelerator.go` | GPU node detection |
+| `fakeclient.go` | Fake client setup for unit tests (`NewTest(t)`) |
+
+## Async getter pattern
+
+Resource getters return a closure for use with `test.Eventually(...)`:
+
+```go
+func TrainJob(t Test, namespace, name string) func(g gomega.Gomega) *trainerv1alpha1.TrainJob {
+    return func(g gomega.Gomega) *trainerv1alpha1.TrainJob {
+        job, err := t.Client().Trainer().TrainerV1alpha1().TrainJobs(namespace).Get(t.Ctx(), name, metav1.GetOptions{})
+        g.Expect(err).NotTo(gomega.HaveOccurred())
+        return job
+    }
+}
+```
+
+Follow this pattern when adding getters for new resource types. The outer function captures the test context; the inner function is retried by gomega.
+
+## Resource creation pattern
+
+```go
+func CreateMyResource(t Test, namespace string, content map[string][]byte) *corev1.MyResource {
+    t.T().Helper()
+
+    resource := &corev1.MyResource{
+        TypeMeta: metav1.TypeMeta{
+            APIVersion: corev1.SchemeGroupVersion.String(),
+            Kind:       "MyResource",
+        },
+        ObjectMeta: metav1.ObjectMeta{
+            GenerateName: "my-resource-",
+            Namespace:    namespace,
+        },
+        // ... fields
+    }
+
+    resource, err := t.Client().Core().CoreV1().MyResources(namespace).Create(t.Ctx(), resource, metav1.CreateOptions{})
+    t.Expect(err).NotTo(gomega.HaveOccurred())
+    t.T().Logf("Created MyResource %s/%s successfully", namespace, resource.Name)
+
+    return resource
+}
+```
+
+Key conventions:
+- Always call `t.T().Helper()` first
+- Use `GenerateName`, never fixed `Name`
+- Assert errors with `t.Expect(err).NotTo(gomega.HaveOccurred())`
+- Log the created resource name
+
+## Condition checker pattern
+
+```go
+func MyResourceConditionReady(resource *v1alpha1.MyResource) metav1.ConditionStatus {
+    return MyResourceCondition(resource, v1alpha1.MyResourceReady)
+}
+
+func MyResourceCondition(resource *v1alpha1.MyResource, conditionType string) metav1.ConditionStatus {
+    for _, condition := range resource.Status.Conditions {
+        if string(condition.Type) == conditionType {
+            return condition.Status
+        }
+    }
+    return metav1.ConditionUnknown
+}
+```
+
+Create one exported function per condition type (Ready, Failed, Complete, etc.) that delegates to a generic condition extractor.
+
+## Option pattern
+
+Used for flexible configuration of namespace, PVC, and other resources:
+
+```go
+type Option[T any] interface {
+    ApplyTo(to T) error
+}
+
+type ErrorOption[T any] func(to T) error
+func (f ErrorOption[T]) ApplyTo(to T) error { return f(to) }
+```
+
+Example - adding a label to a namespace:
+
+```go
+func WithKueueManaged() Option[*corev1.Namespace] {
+    return ErrorOption[*corev1.Namespace](func(ns *corev1.Namespace) error {
+        if ns.Labels == nil {
+            ns.Labels = make(map[string]string)
+        }
+        ns.Labels["kueue.x-k8s.io/managed"] = "true"
+        return nil
+    })
+}
+```
+
+Options are applied via a loop before the API call:
+
+```go
+for _, option := range options {
+    t.Expect(option.ApplyTo(resource)).To(gomega.Succeed())
+}
+```
+
+## Adding a new API client
+
+To add a client for a new Kubernetes API:
+
+1. **Add the import** in `client.go`:
+   ```go
+   newclient "github.com/org/project/pkg/client/clientset/versioned"
+   ```
+
+2. **Extend the `Client` interface**:
+   ```go
+   NewAPI() newclient.Interface
+   ```
+
+3. **Add a field to `testClient` struct**:
+   ```go
+   newAPI newclient.Interface
+   ```
+
+4. **Add the accessor method**:
+   ```go
+   func (t *testClient) NewAPI() newclient.Interface { return t.newAPI }
+   ```
+
+5. **Initialize in `newTestClient()`** (in `test.go`):
+   ```go
+   newAPI, err := newclient.NewForConfig(cfg)
+   // handle error
+   ```
+
+6. **Update `fakeclient.go`** to include the new client for unit tests.
+
+7. **Run `go mod tidy`** to pull the new dependency.
+
+## Adding environment variables
+
+Follow the constant + getter pattern in `environment.go`:
+
+```go
+const (
+    MyNewVar = "MY_NEW_VAR"
+)
+
+func GetMyNewVar(t Test) string {
+    t.T().Helper()
+    return lookupEnvOrDefault(t, MyNewVar, "default-value")
+}
+```
+
+For training images that support operator-injected defaults, use the three-level resolution in `defaults.go`:
+1. Test env var (e.g., `TEST_TRAINING_CUDA_PYTORCH_28_IMAGE`)
+2. Operator `RELATED_IMAGE_*` env var
+3. Hardcoded default in `defaults.go`
+
+## Writing unit tests
+
+Use `NewTest(t)` from `fakeclient.go` to create a test context with fake clients:
+
+```go
+func TestMyHelper(t *testing.T) {
+    test := NewTest(t)
+
+    // Create test fixtures via fake client
+    resource := &corev1.Pod{
+        ObjectMeta: metav1.ObjectMeta{
+            Name:      "test-pod",
+            Namespace: "test-namespace",
+        },
+    }
+    test.client.Core().CoreV1().Pods("test-namespace").Create(test.ctx, resource, metav1.CreateOptions{})
+
+    // Call the function under test
+    result := GetPods(test, "test-namespace", metav1.ListOptions{})
+
+    // Assert
+    test.Expect(result).Should(gomega.HaveLen(1))
+    test.Expect(result[0].Name).To(gomega.Equal("test-pod"))
+}
+```
+
+See `core_test.go`, `trainjob_test.go`, `environment_test.go` for more examples.
+
+## Per-suite extensions
+
+Put helpers in per-suite `support.go` (e.g., `tests/trainer/support.go`) when they:
+- Use embedded test resources specific to that suite
+- Reference suite-specific APIs or configurations
+- Would not be useful to other test suites
+
+Put helpers in `tests/common/support/` when they:
+- Work with standard Kubernetes or shared custom resources
+- Could be reused across multiple test suites
+
+## Validation
+
+```bash
+make unit-test                                        # Run all support lib unit tests
+make golangci-lint LINT_PKG=./tests/common/support/...  # Lint the support package
+go vet ./tests/common/support/...                     # Vet the support package
+make verify-imports                                   # Verify import ordering
+```
+
+## Checklist
+
+- [ ] New helpers follow the async getter or resource creation pattern
+- [ ] `GenerateName` used for all created resources
+- [ ] `t.T().Helper()` called at the top of every helper function
+- [ ] Unit tests added in a corresponding `_test.go` file
+- [ ] `make unit-test` passes
+- [ ] `make golangci-lint LINT_PKG=./tests/common/support/...` passes
+- [ ] `make verify-imports` passes
diff --git a/.cursor/rules/add-benchmark.mdc b/.cursor/rules/add-benchmark.mdc
new file mode 100644
index 000000000..d3df5f90c
--- /dev/null
+++ b/.cursor/rules/add-benchmark.mdc
@@ -0,0 +1,153 @@
+---
+description: "Guide for adding benchmarks to benchmarks/"
+globs: "benchmarks/**/*"
+alwaysApply: false
+---
+
+# Add Benchmark
+
+Guide for adding a new benchmark to `benchmarks/` in the distributed-workloads repo.
+
+## Directory layout
+
+Each benchmark lives in its own subdirectory under `benchmarks/`:
+
+```
+benchmarks/<benchmark-name>/
+  Dockerfile              # Multi-stage build for the benchmark image
+  Dockerfile.cuda         # (optional) CUDA variant
+  mpi-runtime.yaml        # ClusterTrainingRuntime defining the MPI execution environment
+  trainjob.yaml           # TrainJob manifest to submit the benchmark
+  README.md               # Documentation (what, files, quick start, parameters, output)
+  <scripts>               # (optional) Training/benchmark scripts mounted via ConfigMap
+```
+
+See `benchmarks/osu-benchmarks/` and `benchmarks/kftv2-mpi-ddp-sft/` for reference implementations.
+
+## Dockerfile conventions
+
+Follow the multi-stage build pattern used in `benchmarks/osu-benchmarks/Dockerfile`:
+
+1. **Stage 1 (builder)** - compile dependencies from source (e.g., OpenMPI, benchmark binaries)
+2. **Stage 2 (runtime)** - copy built artifacts, configure SSH for MPI, set up the runtime environment
+
+Key requirements:
+- Base image from `quay.io/opendatahub/` or `quay.io/modh/`
+- `USER 0` only during build stages; final image must use `USER 1001`
+- OpenShift GID 0 pattern: `chgrp -R 0 <dir> && chmod -R g=u <dir>`
+- Allow random UID: `chmod g=u /etc/passwd`
+- SSH authentication via Training Operator's `sshAuthMountPath` -- keys are auto-injected at the path specified in the ClusterTrainingRuntime, not baked into the image. Workers generate host keys at startup.
+- For CUDA variants, create a separate `Dockerfile.cuda` extending the base
+
+## ClusterTrainingRuntime
+
+Define a `ClusterTrainingRuntime` resource with MPI configuration. Key fields:
+
+```yaml
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: ClusterTrainingRuntime
+metadata:
+  name: <runtime-name>
+spec:
+  mlPolicy:
+    mpi:
+      mpiImplementation: OpenMPI
+      sshAuthMountPath: /tmp/ssh
+  template:
+    spec:
+      replicatedJobs:
+        - name: launcher
+          replicas: 1
+          template: ...
+        - name: worker
+          replicas: <N>
+          template: ...
+```
+
+- Launcher: runs the benchmark command (mpirun/mpiexec)
+- Workers: run sshd and wait for MPI connections
+- Both need the SSH setup commands in their entrypoints
+
+See `benchmarks/osu-benchmarks/mpi-runtime-cpu.yaml` for a complete example.
+
+## TrainJob
+
+Submit benchmarks using a `TrainJob` with `generateName` (not fixed `name`):
+
+```yaml
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: TrainJob
+metadata:
+  generateName: <benchmark-name>-
+  namespace: <namespace>
+spec:
+  runtimeRef:
+    apiGroup: trainer.kubeflow.org
+    kind: ClusterTrainingRuntime
+    name: <runtime-name>
+  trainer:
+    numNodes: 2
+    resourcesPerNode:
+      requests:
+        nvidia.com/gpu: "2"
+    env:
+      - name: PARAM_NAME
+        value: "value"
+```
+
+Use `trainer.env` for benchmark parameters - the controller injects them into all pod containers.
+
+See `benchmarks/kftv2-mpi-ddp-sft/trainjob.yaml` for a complete example.
+
+## Makefile targets
+
+Add build/push targets to the root `Makefile` following the existing pattern:
+
+```makefile
+BENCHMARK_VERSION ?= latest
+
+.PHONY: build-<name>-benchmark-image
+build-<name>-benchmark-image:
+	$(CONTAINER_ENGINE) build -t quay.io/modh/distributed-workloads-benchmark:trainer-mpi-<name>-$(BENCHMARK_VERSION) \
+	  -f benchmarks/<name>/Dockerfile benchmarks/<name>/
+
+.PHONY: push-<name>-benchmark-image
+push-<name>-benchmark-image:
+	$(CONTAINER_ENGINE) push quay.io/modh/distributed-workloads-benchmark:trainer-mpi-<name>-$(BENCHMARK_VERSION)
+```
+
+Registry: `quay.io/modh/distributed-workloads-benchmark`
+Tag format: `trainer-mpi-<name>-<version>`
+
+## CI workflow
+
+Create `.github/workflows/build-and-push-<name>-benchmark.yml` matching the structure in `build-and-push-osu-benchmark.yml`:
+
+- Trigger on push/PR when files under `benchmarks/<name>/` change
+- Build on all branches, push only on `main`
+- Use `docker/build-push-action` with appropriate Dockerfile path
+
+## README
+
+Every benchmark must include a `README.md` with these sections (see `benchmarks/kftv2-mpi-ddp-sft/README.md`):
+
+| Section | Content |
+|---------|---------|
+| Title + summary | One-line description of what the benchmark measures |
+| What this benchmark does | Table with algorithm, model, dataset, backend, runtime, image |
+| Files | Table mapping each file to its purpose |
+| Quick start | Numbered steps: deploy runtime, create namespace/ConfigMap, submit TrainJob, monitor |
+| Scaling | Table showing node/GPU configurations |
+| Benchmark parameters | Tables for training and infrastructure parameters with defaults and impact |
+| Expected output | Example benchmark summary output |
+| Known issues | Documented limitations and workarounds |
+| Cleanup | Commands to remove all created resources |
+
+## Checklist
+
+- [ ] Dockerfile builds successfully: `make build-<name>-benchmark-image`
+- [ ] ClusterTrainingRuntime applies: `oc apply -f benchmarks/<name>/mpi-runtime.yaml`
+- [ ] TrainJob submits and runs: `oc create -f benchmarks/<name>/trainjob.yaml`
+- [ ] README has all required sections
+- [ ] Makefile targets added for build and push
+- [ ] CI workflow triggers on path changes to `benchmarks/<name>/`
diff --git a/.cursor/rules/add-e2e-test.mdc b/.cursor/rules/add-e2e-test.mdc
new file mode 100644
index 000000000..c5d597870
--- /dev/null
+++ b/.cursor/rules/add-e2e-test.mdc
@@ -0,0 +1,103 @@
+---
+description: "Guide for adding E2E tests to the distributed-workloads repo"
+globs: "tests/**/*.go"
+alwaysApply: false
+---
+
+# Add E2E Test
+
+Guide for adding a new end-to-end test to the distributed-workloads repo.
+
+## Test structure
+
+```go
+func TestMyFeature(t *testing.T) {
+    Tags(t, Tier1)         // 1. tag / skip checks
+    test := With(t)        // 2. create test context
+
+    namespace := test.NewTestNamespace().Name  // 3. isolated namespace
+
+    // 4. create resources with GenerateName
+    // 5. ensure cleanup of cluster-scoped resources
+    // 6. assert with test.Eventually(...)
+}
+```
+
+## Namespace isolation
+
+Every test must operate in its own dedicated namespace. Use `test.NewTestNamespace()` — it creates a uniquely named namespace and registers automatic cleanup (log collection + deletion) via `t.Cleanup`:
+
+```go
+namespace := test.NewTestNamespace().Name
+```
+
+Never use a fixed namespace name unless driven by an env var for a specific scenario (e.g., pre-upgrade/post-upgrade tests). Shared namespaces cause interference between tests.
+
+## Resource naming
+
+All Kubernetes resources must use `GenerateName` instead of a fixed `Name` to avoid collisions:
+
+```go
+// Good
+ObjectMeta: metav1.ObjectMeta{GenerateName: "test-trainjob-"}
+
+// Bad
+ObjectMeta: metav1.ObjectMeta{Name: "my-trainjob"}
+```
+
+## Cleanup
+
+Namespace-scoped resources are deleted automatically when the test namespace is cleaned up. Cluster-scoped resources (e.g., `ClusterRole`, `ClusterRoleBinding`) are not namespace-bound and may need to be explicitly cleaned up if the helper creating them does not already register a cleanup hook via `t.T().Cleanup(...)`.
+
+## Tags
+
+All tests **must** declare a tag -- this is mandatory. Apply it as the first statement so tests are skipped early when `TEST_TIER` is set:
+
+| Tag | When to use |
+|-----|-------------|
+| `Smoke` | Minimal deployment verification |
+| `Tier1`–`Tier3` | Progressively deeper coverage |
+| `Gpu(accelerator)` | Requires at least one GPU node |
+| `MultiGpu(accelerator, n)` | Requires n GPUs per node |
+| `MultiNode(n)` | Requires n worker nodes |
+| `MultiNodeGpu(n, accelerator)` | Requires n nodes each with at least one GPU |
+| `MultiNodeMultiGpu(n, accelerator, gpus)` | Requires n nodes each with at least gpus GPUs |
+
+## Environment variables
+
+Declare env var constants and getter functions in `tests/common/support/environment.go`. Never use `os.Getenv` directly in test files — always go through a getter.
+
+## Editing notebooks
+
+Test notebooks (`tests/**/resources/*.ipynb`) use 1-space JSON indentation with no trailing newline. When editing notebook cells, preserve the array-of-lines source format — do not collapse source arrays into single strings:
+
+```jsonc
+// Good — array of lines, readable in raw JSON
+"source": [
+ "import os\n",
+ "print('hello')"
+]
+
+// Bad — single string, hard to read in raw JSON
+"source": "import os\nprint('hello')"
+```
+
+If a tool (e.g. `NotebookEdit`) converts the edited cell's source to a single string, convert it back to array-of-lines before committing. You can use a Python script like the following (set `path` to the notebook file first):
+
+```python
+import json
+with open(path, encoding="utf-8") as f:
+    nb = json.load(f)
+for cell in nb["cells"]:
+    if isinstance(cell["source"], str):
+        cell["source"] = cell["source"].splitlines(True)
+        # Ensure last line has no trailing newline (notebook convention)
+        if cell["source"] and cell["source"][-1].endswith("\n"):
+            cell["source"][-1] = cell["source"][-1][:-1]
+with open(path, "w", encoding="utf-8") as f:
+    json.dump(nb, f, indent=1, ensure_ascii=False)
+```
+
+## Key support library files
+
+See the [update-support-lib rule](update-support-lib.mdc) for the full file map. The most frequently used files when writing tests: `test.go` (Test interface), `client.go` (API clients), `environment.go` (env var getters), and the per-API helpers (`trainjob.go`, `pytorchjob.go`, `ray.go`, `kueue.go`).
diff --git a/.cursor/rules/update-support-lib.mdc b/.cursor/rules/update-support-lib.mdc
new file mode 100644
index 000000000..4d66b336f
--- /dev/null
+++ b/.cursor/rules/update-support-lib.mdc
@@ -0,0 +1,243 @@
+---
+description: "Guide for modifying the shared test support library"
+globs: "tests/common/support/**/*.go"
+alwaysApply: false
+---
+
+# Update Support Library
+
+Guide for modifying the shared test support library at `tests/common/support/`.
+
+## File organization
+
+| File | Domain |
+|------|--------|
+| `test.go` | Test interface (`With(t)`, `Ctx()`, `Client()`, `NewTestNamespace()`) |
+| `client.go` | Client interface with 13 API accessors (Core, Trainer, Kubeflow, Ray, etc.) |
+| `namespace.go` | Namespace creation, cleanup, log/event capture |
+| `environment.go` | Environment variable constants and typed getter functions |
+| `defaults.go` | Hardcoded default image versions and fallback values |
+| `core.go` | Pod, ConfigMap, Secret, PVC helpers |
+| `trainjob.go` | TrainJob getters and condition checkers |
+| `pytorchjob.go` | PyTorchJob getters and condition checkers |
+| `ray.go` | RayJob/RayCluster helpers |
+| `kueue.go` | ResourceFlavor, ClusterQueue, LocalQueue helpers |
+| `conditions.go` | Generic Kubernetes condition evaluation |
+| `events.go` | Event capture and formatting for debugging |
+| `rbac.go` | Role/RoleBinding creation |
+| `accelerator.go` | GPU node detection |
+| `fakeclient.go` | Fake client setup for unit tests (`NewTest(t)`) |
+
+## Async getter pattern
+
+Resource getters return a closure for use with `test.Eventually(...)`:
+
+```go
+func TrainJob(t Test, namespace, name string) func(g gomega.Gomega) *trainerv1alpha1.TrainJob {
+    return func(g gomega.Gomega) *trainerv1alpha1.TrainJob {
+        job, err := t.Client().Trainer().TrainerV1alpha1().TrainJobs(namespace).Get(t.Ctx(), name, metav1.GetOptions{})
+        g.Expect(err).NotTo(gomega.HaveOccurred())
+        return job
+    }
+}
+```
+
+Follow this pattern when adding getters for new resource types. The outer function captures the test context; the inner function is retried by gomega.
+
+## Resource creation pattern
+
+```go
+func CreateMyResource(t Test, namespace string, content map[string][]byte) *corev1.MyResource {
+    t.T().Helper()
+
+    resource := &corev1.MyResource{
+        TypeMeta: metav1.TypeMeta{
+            APIVersion: corev1.SchemeGroupVersion.String(),
+            Kind:       "MyResource",
+        },
+        ObjectMeta: metav1.ObjectMeta{
+            GenerateName: "my-resource-",
+            Namespace:    namespace,
+        },
+        // ... fields
+    }
+
+    resource, err := t.Client().Core().CoreV1().MyResources(namespace).Create(t.Ctx(), resource, metav1.CreateOptions{})
+    t.Expect(err).NotTo(gomega.HaveOccurred())
+    t.T().Logf("Created MyResource %s/%s successfully", namespace, resource.Name)
+
+    return resource
+}
+```
+
+Key conventions:
+- Always call `t.T().Helper()` first
+- Use `GenerateName`, never fixed `Name`
+- Assert errors with `t.Expect(err).NotTo(gomega.HaveOccurred())`
+- Log the created resource name
+
+## Condition checker pattern
+
+```go
+func MyResourceConditionReady(resource *v1alpha1.MyResource) metav1.ConditionStatus {
+    return MyResourceCondition(resource, v1alpha1.MyResourceReady)
+}
+
+func MyResourceCondition(resource *v1alpha1.MyResource, conditionType string) metav1.ConditionStatus {
+    for _, condition := range resource.Status.Conditions {
+        if string(condition.Type) == conditionType {
+            return condition.Status
+        }
+    }
+    return metav1.ConditionUnknown
+}
+```
+
+Create one exported function per condition type (Ready, Failed, Complete, etc.) that delegates to a generic condition extractor.
+
+## Option pattern
+
+Used for flexible configuration of namespace, PVC, and other resources:
+
+```go
+type Option[T any] interface {
+    ApplyTo(to T) error
+}
+
+type ErrorOption[T any] func(to T) error
+func (f ErrorOption[T]) ApplyTo(to T) error { return f(to) }
+```
+
+Example - adding a label to a namespace:
+
+```go
+func WithKueueManaged() Option[*corev1.Namespace] {
+    return ErrorOption[*corev1.Namespace](func(ns *corev1.Namespace) error {
+        if ns.Labels == nil {
+            ns.Labels = make(map[string]string)
+        }
+        ns.Labels["kueue.x-k8s.io/managed"] = "true"
+        return nil
+    })
+}
+```
+
+Options are applied via a loop before the API call:
+
+```go
+for _, option := range options {
+    t.Expect(option.ApplyTo(resource)).To(gomega.Succeed())
+}
+```
+
+## Adding a new API client
+
+To add a client for a new Kubernetes API:
+
+1. **Add the import** in `client.go`:
+   ```go
+   newclient "github.com/org/project/pkg/client/clientset/versioned"
+   ```
+
+2. **Extend the `Client` interface**:
+   ```go
+   NewAPI() newclient.Interface
+   ```
+
+3. **Add a field to `testClient` struct**:
+   ```go
+   newAPI newclient.Interface
+   ```
+
+4. **Add the accessor method**:
+   ```go
+   func (t *testClient) NewAPI() newclient.Interface { return t.newAPI }
+   ```
+
+5. **Initialize in `newTestClient()`** (in `test.go`):
+   ```go
+   newAPI, err := newclient.NewForConfig(cfg)
+   // handle error
+   ```
+
+6. **Update `fakeclient.go`** to include the new client for unit tests.
+
+7. **Run `go mod tidy`** to pull the new dependency.
+
+## Adding environment variables
+
+Follow the constant + getter pattern in `environment.go`:
+
+```go
+const (
+    MyNewVar = "MY_NEW_VAR"
+)
+
+func GetMyNewVar(t Test) string {
+    t.T().Helper()
+    return lookupEnvOrDefault(t, MyNewVar, "default-value")
+}
+```
+
+For training images that support operator-injected defaults, use the three-level resolution in `defaults.go`:
+1. Test env var (e.g., `TEST_TRAINING_CUDA_PYTORCH_28_IMAGE`)
+2. Operator `RELATED_IMAGE_*` env var
+3. Hardcoded default in `defaults.go`
+
+## Writing unit tests
+
+Use `NewTest(t)` from `fakeclient.go` to create a test context with fake clients:
+
+```go
+func TestMyHelper(t *testing.T) {
+    test := NewTest(t)
+
+    // Create test fixtures via fake client
+    resource := &corev1.Pod{
+        ObjectMeta: metav1.ObjectMeta{
+            Name:      "test-pod",
+            Namespace: "test-namespace",
+        },
+    }
+    test.client.Core().CoreV1().Pods("test-namespace").Create(test.ctx, resource, metav1.CreateOptions{})
+
+    // Call the function under test
+    result := GetPods(test, "test-namespace", metav1.ListOptions{})
+
+    // Assert
+    test.Expect(result).Should(gomega.HaveLen(1))
+    test.Expect(result[0].Name).To(gomega.Equal("test-pod"))
+}
+```
+
+See `core_test.go`, `trainjob_test.go`, `environment_test.go` for more examples.
+
+## Per-suite extensions
+
+Put helpers in per-suite `support.go` (e.g., `tests/trainer/support.go`) when they:
+- Use embedded test resources specific to that suite
+- Reference suite-specific APIs or configurations
+- Would not be useful to other test suites
+
+Put helpers in `tests/common/support/` when they:
+- Work with standard Kubernetes or shared custom resources
+- Could be reused across multiple test suites
+
+## Validation
+
+```bash
+make unit-test                                        # Run all support lib unit tests
+make golangci-lint LINT_PKG=./tests/common/support/...  # Lint the support package
+go vet ./tests/common/support/...                     # Vet the support package
+make verify-imports                                   # Verify import ordering
+```
+
+## Checklist
+
+- [ ] New helpers follow the async getter or resource creation pattern
+- [ ] `GenerateName` used for all created resources
+- [ ] `t.T().Helper()` called at the top of every helper function
+- [ ] Unit tests added in a corresponding `_test.go` file
+- [ ] `make unit-test` passes
+- [ ] `make golangci-lint LINT_PKG=./tests/common/support/...` passes
+- [ ] `make verify-imports` passes
diff --git a/AGENTS.md b/AGENTS.md
index 1f3da2fd6..0a5529ea9 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -2,6 +2,8 @@
 
 E2E test suite for distributed workloads on RHOAI covering KFTO v1, Trainer v2, and KubeRay, plus training examples and runtime/test images. Built with Go, Python, Kubernetes, Ray, PyTorch.
 
+See [ARCHITECTURE.md](ARCHITECTURE.md) for the full repository structure including test suites, images, benchmarks, and examples.
+
 ## Structure
 
 - `tests/` - E2E test suites (Go)
@@ -51,111 +53,45 @@ make precommit                                    # Run all pre-commit hooks
 
 ### Targeted lint/format
 
-For quick feedback on specific files instead of running project-wide:
-
 ```bash
-# Go
-make golangci-lint LINT_PKG=./tests/common/support/...    # Lint a single Go package
-go vet ./tests/common/support/...                         # Vet a single Go package
-gofmt -w path/to/file.go                                  # Format a single Go file
-
-# Python
-pre-commit run --files path/to/file.py                    # Run all hooks on a single file
-
+make golangci-lint LINT_PKG=./path/to/package/...    # Lint a single Go package
+go vet ./path/to/package/...                         # Vet a single Go package
+gofmt -w path/to/file.go                             # Format a single Go file
+pre-commit run --files path/to/file.py               # Run all hooks on a single file
 ```
 
 ## Writing Tests
 
-### Namespace isolation
-
-Every test must operate in its own dedicated namespace. Use `test.NewTestNamespace()` — it creates a uniquely named namespace and registers automatic cleanup (log collection + deletion) via `t.Cleanup`:
-
-```go
-namespace := test.NewTestNamespace().Name
-```
-
-Never use a fixed namespace name unless driven by an env var for a specific scenario (e.g., pre-upgrade/post-upgrade tests). Shared namespaces cause interference between tests.
-
-### Resource naming
-
-All Kubernetes resources must use `GenerateName` instead of a fixed `Name` to avoid collisions:
+See [`.claude/skills/add-e2e-test/SKILL.md`](.claude/skills/add-e2e-test/SKILL.md) for the full guide on writing E2E tests (namespace isolation, resource naming, cleanup, tags, notebook editing, environment variables).
 
-```go
-// Good
-ObjectMeta: metav1.ObjectMeta{GenerateName: "test-trainjob-"}
+## Benchmarks
 
-// Bad
-ObjectMeta: metav1.ObjectMeta{Name: "my-trainjob"}
-```
-
-### Cleanup
-
-Namespace-scoped resources are deleted automatically when the test namespace is cleaned up. Cluster-scoped resources (e.g., `ClusterRole`, `ClusterRoleBinding`) are not namespace-bound and may need to be explicitly cleaned up if the helper creating them does not already register a cleanup hook via `t.T().Cleanup(...)`.
-
-### Test structure
-
-```go
-func TestMyFeature(t *testing.T) {
-    Tags(t, Tier1)         // 1. tag / skip checks
-    test := With(t)        // 2. create test context
+See [`.claude/skills/add-benchmark/SKILL.md`](.claude/skills/add-benchmark/SKILL.md) for the guide on adding new benchmarks (Dockerfile, ClusterTrainingRuntime, TrainJob, CI workflow).
 
-    namespace := test.NewTestNamespace().Name  // 3. isolated namespace
+## Support Library
 
-    // 4. create resources with GenerateName
-    // 5. ensure cleanup of cluster-scoped resources
-    // 6. assert with test.Eventually(...)
-}
-```
-
-### Editing notebooks
+See [`.claude/skills/update-support-lib/SKILL.md`](.claude/skills/update-support-lib/SKILL.md) for the guide on modifying the shared test support library (getters, condition checkers, client abstraction, option pattern).
 
-Test notebooks (`tests/**/resources/*.ipynb`) use 1-space JSON indentation with no trailing newline. When editing notebook cells, preserve the array-of-lines source format — do not collapse source arrays into single strings:
+## Common Workflows
 
-```json
-// Good — array of lines, readable in raw JSON
-"source": [
- "import os\n",
- "print('hello')"
-]
+The most frequent tasks in this repo, based on commit history:
 
-// Bad — single string, hard to read in raw JSON
-"source": "import os\nprint('hello')"
-```
-
-If a tool (e.g. `NotebookEdit`) converts the edited cell's source to a single string, convert it back to array-of-lines before committing. You can use a Python script:
-
-```python
-import json
-with open(path, encoding="utf-8") as f:
-    nb = json.load(f)
-for cell in nb["cells"]:
-    if isinstance(cell["source"], str):
-        cell["source"] = cell["source"].splitlines(True)
-        # Ensure last line has no trailing newline (notebook convention)
-        if cell["source"] and cell["source"][-1].endswith("\n"):
-            cell["source"][-1] = cell["source"][-1][:-1]
-with open(path, "w", encoding="utf-8") as f:
-    json.dump(nb, f, indent=1, ensure_ascii=False)
-```
+- **CVE-driven Python dependency updates** -- updating a single dependency across training image variants (see CVE Fixes below)
+- **Adding E2E tests** -- see [Writing Tests](#writing-tests)
+- **Adding benchmarks** -- see [Benchmarks](#benchmarks)
+- **Updating the support library** -- see [Support Library](#support-library)
 
-### Environment variables
+Commit message format for JIRA-tracked work: `RHOAIENG-NNNNN: <description> in <image-variant-name>`
 
-Declare env var constants and getter functions in `tests/common/support/environment.go`. Never use `os.Getenv` directly in test files — always go through a getter.
+## CVE Fixes -- Python dependency updates
 
-### Tags
+Two image families with different dependency management:
 
-Tests in `tests/trainer/` **must** declare a tag — this is mandatory. Apply it as the first statement so tests are skipped early when `TEST_TIER` is set:
+- **Runtime training images** (`images/runtime/training/`) use `Pipfile`/`Pipfile.lock` (pipenv) and pull from public PyPI. Exception: the two openmpi41 variants use `pyproject.toml`/`requirements.txt` instead. See [images/runtime/training/README.md](images/runtime/training/README.md).
+- **Universal training images** (`images/universal/training/`) use `pyproject.toml`/`requirements.txt` (pip) and pull from a **private AIPCC PyPI index** -- always query the index for available versions before pinning. See [images/universal/training/README.md](images/universal/training/README.md#cve-fixes--python-dependency-updates).
 
-| Tag | When to use |
-|-----|-------------|
-| `Smoke` | Minimal deployment verification |
-| `Tier1`–`Tier3` | Progressively deeper coverage |
-| `Gpu(accelerator)` | Requires at least one GPU node |
-| `MultiGpu(accelerator, n)` | Requires n GPUs per node |
-| `MultiNode(n)` | Requires n worker nodes |
-| `MultiNodeGpu(n, accelerator)` | Requires n nodes each with at least one GPU |
-| `MultiNodeMultiGpu(n, accelerator, gpus)` | Requires n nodes each with at least gpus GPUs |
+Each image variant is updated independently with its own commit.
 
-## CVE Fixes — Python dependency updates
+## AI Agent Skills
 
-See [images/universal/training/README.md](images/universal/training/README.md#cve-fixes--python-dependency-updates) for instructions on updating Python dependencies in training images. Key point: dependencies come from a private AIPCC PyPI index, not public PyPI — always query the index for available versions before pinning.
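+`.claude/skills/` is the canonical source for AI agent skills. Run `make sync-agent-skills` after editing any skill to sync to other tools (Cursor, etc.). The files under `.cursor/rules/` are generated by that target and overwritten on every sync -- do not edit them by hand.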
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
new file mode 100644
index 000000000..e1e7c71d6
--- /dev/null
+++ b/ARCHITECTURE.md
@@ -0,0 +1,170 @@
+# Architecture
+
+E2E test suite for distributed workloads on Red Hat OpenShift AI (RHOAI), covering Kubeflow Training Operator v1 (KFTO), Kubeflow Trainer v2, and KubeRay.
+
+## Test suites
+
+```text
+tests/
+├── kfto/           KFTO v1 — PyTorchJob-based distributed training
+├── trainer/        Kubeflow Trainer v2 — TrainJob / JobSet-based training
+├── odh/            KubeRay — Ray cluster and RayJob-based training
+├── fms/            Foundation model fine-tuning (fms-hf-tuning)
+│   ├── kfto/         via KFTO PyTorchJob
+│   └── trainer/      via Trainer v2 TrainJob
+└── common/         Shared test infrastructure
+    └── support/      Client abstractions, resource helpers, test lifecycle
+```
+
+### kfto — Kubeflow Training Operator v1
+
+Tests PyTorchJob-based distributed training using the legacy Kubeflow Training Operator. Covers MNIST training (single/multi-node, single/multi-GPU), LLM supervised fine-tuning (SFT), Kueue integration, SDK usage, and upgrade scenarios.
+
+### trainer — Kubeflow Trainer v2
+
+Tests TrainJob-based distributed training using the modern Kubeflow Trainer v2. Covers PyTorch DDP (Fashion MNIST, multi-node/multi-GPU), MPI jobs (OpenMPI), Kueue integration, TrainingRuntime/ClusterTrainingRuntime, Kubeflow SDK, and upgrade scenarios. This is the primary and most actively developed test suite.
+
+### odh — KubeRay
+
+Tests Ray-based distributed training via RayCluster and RayJob. Covers MNIST training, Ray Tune hyperparameter optimization, and LLM fine-tuning with DeepSpeed.
+
+### fms — Foundation model fine-tuning
+
+Tests the fms-hf-tuning container image for LLM fine-tuning (SFT, LoRA, QLoRA) through two parallel orchestration paths: KFTO PyTorchJob (`fms/kfto/`) and Trainer v2 TrainJob (`fms/trainer/`). Both paths test the same training workload with different orchestration, validating that fms-hf-tuning works correctly under each framework. Includes S3 data staging via batch jobs.
+
+## Suite relationships
+
+- **kfto** is the legacy operator; **trainer** is its modern replacement. Both test PyTorch distributed training but via different CRDs (PyTorchJob vs TrainJob).
+- **fms** tests the same fms-hf-tuning workload via both kfto and trainer, ensuring parity across orchestration frameworks.
+- **odh** covers Ray-based parallelism, complementing the PyTorch-based kfto and trainer suites.
+
+## Shared support library
+
+`tests/common/support/` provides the test infrastructure used by all suites (~40 files).
+
+### Test lifecycle
+
+- **`test.go`** — `Test` interface: wraps `*testing.T` with gomega assertions (`Eventually`, `Expect`), context management, and namespace helpers.
+- **`namespace.go`** — `NewTestNamespace()`: creates an isolated namespace per test with automatic cleanup (pod log collection, event capture, namespace deletion) via `t.Cleanup`.
+
+### Client abstraction
+
+- **`client.go`** — `Client` interface: lazy-initialized accessor for multiple Kubernetes API clients:
+  - Core Kubernetes, Dynamic, Storage
+  - Kubeflow Training Operator (`kubeflowclient`)
+  - Kubeflow Trainer v2 (`trainerclient`)
+  - KubeRay (`rayclient`)
+  - Kueue (`kueueclient`), Kueue Operator
+  - JobSet (`jobsetclient`)
+  - OpenShift Machine API, Routes, ImageStreams
+  - OLM (Operator Lifecycle Manager)
+
+### Per-API resource helpers
+
+Each distributed workload API has a dedicated helper file with getters, condition checkers, and builders:
+
+| File | API |
+|------|-----|
+| `pytorchjob.go` | PyTorchJob (Running, Succeeded, Failed, Suspended) |
+| `trainjob.go` | TrainJob (Complete, Failed, Suspended) |
+| `ray.go` | RayJob, RayCluster (status, logs) |
+| `kueue.go` | ResourceFlavor, ClusterQueue, LocalQueue, workload admission |
+| `jobset.go` | JobSet resources |
+
+### Other shared utilities
+
+| File | Purpose |
+|------|---------|
+| `environment.go` | Environment variable getters (never use `os.Getenv` directly) |
+| `core.go` | Pod, ConfigMap, Secret, PVC helpers |
+| `rbac.go` | Role / RoleBinding creation for test isolation |
+| `conditions.go` | Kubernetes condition evaluation |
+| `events.go` | Event capture for debugging |
+| `accelerator.go` | GPU node detection |
+
+### Per-suite extensions
+
+Each suite has a `support.go` that imports `tests/common/support` and adds suite-specific utilities (e.g., embedded test resource files via `//go:embed`, Prometheus queries for GPU utilization). Per-suite files extend — never wrap — the common `Test` interface.
+
+### Common utilities outside support/
+
+`tests/common/` (outside `support/`) provides cross-suite utilities:
+
+| File | Purpose |
+|------|---------|
+| `test_tag.go` | Tag functions (`Smoke`, `Tier1`–`Tier3`, `Gpu`, `MultiNode`, etc.) and `Tags()` helper for test filtering |
+| `environment.go` | Shared env var getters (test tier, notebook config, HuggingFace token) |
+| `notebook.go` | Notebook creation with GPU allocation and Kueue integration |
+| `template.go` | Go template parsing for dynamic Kubernetes manifests |
+
+## Benchmarks
+
+```text
+benchmarks/
+├── kftv2-mpi-ddp-sft/    MPI DDP SFT training (Qwen 2.5 + GSM8K)
+│   ├── README.md
+│   ├── mpi-runtime.yaml       ClusterTrainingRuntime
+│   ├── train_sft_ddp.py       Training script (mounted via ConfigMap)
+│   └── trainjob.yaml          TrainJob manifest
+└── osu-benchmarks/        OSU MPI micro-benchmarks (point-to-point + collective)
+    ├── Dockerfile             CPU variant
+    ├── Dockerfile.cuda        CUDA variant
+    ├── mpi-runtime-cpu.yaml   ClusterTrainingRuntime (CPU)
+    ├── mpi-runtime-gpu.yaml   ClusterTrainingRuntime (GPU)
+    ├── osu-trainjob-cpu.yaml  TrainJob (CPU)
+    ├── osu-trainjob-gpu.yaml  TrainJob (GPU)
+    └── uid_entrypoint.sh      UID entrypoint for OpenShift
+```
+
+Each benchmark defines a **ClusterTrainingRuntime** (MPI execution environment) and a **TrainJob** (workload submission). See the [add-benchmark skill](.claude/skills/add-benchmark/SKILL.md) for the full guide.
+
+## Images
+
+```text
+images/
+├── dataset/
+│   └── alpaca/                    Alpaca dataset image
+├── model/
+│   └── bloom560m/                 BLOOM-560M model image
+├── runtime/
+│   ├── training/                  Runtime training images (~10 variants)
+│   │   ├── py311-cuda121-torch241/
+│   │   ├── py311-cuda124-torch251/
+│   │   ├── ...
+│   │   └── py312-rocm64-torch290/
+│   ├── ray/                       Ray runtime images
+│   │   ├── cuda/                    CUDA variants
+│   │   └── rocm/                    ROCm variants
+│   └── examples/                  Example-specific runtime images
+├── universal/
+│   └── training/                  Universal training images (3 variants)
+│       ├── th06-cpu-torch210-py312/
+│       ├── th06-cuda130-torch210-py312/
+│       └── th06-rocm64-torch291-py312/
+├── tests/                         Test runner image
+└── util/
+    └── mc-cli/                    MinIO client utility image
+```
+
+Key distinction for dependency management (matters for CVE fixes):
+
+- **Runtime training images** (`images/runtime/training/`) use `Pipfile`/`Pipfile.lock` (pipenv) and pull from public PyPI. Two openmpi41 variants are exceptions that use `pyproject.toml`/`requirements.txt` instead. See `images/runtime/training/README.md`.
+- **Universal training images** (`images/universal/training/`) use `pyproject.toml`/`requirements.txt` (pip) and pull from a private AIPCC PyPI index. See `images/universal/training/README.md`.
+
+## Examples
+
+```text
+examples/
+├── hpo-raytune/                    Ray Tune HPO on OpenShift AI
+├── kfto-dreambooth/                Stable Diffusion DreamBooth with KFTO
+├── kfto-feast/                     Fine-tuning with Feast feature store
+├── kfto-sft-feast-rag/             SFT + Feast + RAG pipeline
+├── kfto-sft-llm/                   LLM SFT with KFTO
+├── kfto_feast_rag/                 End-to-end RAG with Feast + Milvus
+├── rag-llm/                        RAG with HuggingFace + sentence-transformers
+├── ray-docling/                    Batch document processing with Ray + Docling
+├── ray-finetune-llm-deepspeed/     LLM fine-tuning with Ray + DeepSpeed
+└── stable-diffusion-dreambooth/    Stable Diffusion DreamBooth (standalone)
+```
+
+Each example contains a README, one or more Jupyter notebooks, and supporting resources (datasets, configs, Kubernetes manifests).
diff --git a/Makefile b/Makefile
index 9e2a97301..3f291ec96 100644
--- a/Makefile
+++ b/Makefile
@@ -85,3 +85,7 @@ golangci-lint: golangci-lint-install ## Run golangci-lint on the codebase.
 .PHONY: precommit
 precommit:
 	pre-commit run --all-files
+
+# Regenerates .cursor/rules/<skill>.mdc from each .claude/skills/<skill>/SKILL.md
+# by running hack/sync-agent-skills.sh. Phony: it is a command, not a file.
+.PHONY: sync-agent-skills
+sync-agent-skills: ## Sync AI agent skills from .claude/skills/ to other tools (Cursor, etc.)
+	@./hack/sync-agent-skills.sh
diff --git a/hack/sync-agent-skills.sh b/hack/sync-agent-skills.sh
new file mode 100755
index 000000000..ff3b6fb6a
--- /dev/null
+++ b/hack/sync-agent-skills.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SKILLS_DIR=".claude/skills"
+CURSOR_RULES_DIR=".cursor/rules"
+
+# Print the Cursor rule description for the skill named by $1.
+# Skills without a dedicated entry fall back to their own name.
+get_description() {
+  local skill="$1"
+  local text
+
+  case "$skill" in
+    add-e2e-test)
+      text="Guide for adding E2E tests to the distributed-workloads repo"
+      ;;
+    add-benchmark)
+      text="Guide for adding benchmarks to benchmarks/"
+      ;;
+    update-support-lib)
+      text="Guide for modifying the shared test support library"
+      ;;
+    *)
+      text="$skill"
+      ;;
+  esac
+
+  echo "$text"
+}
+
+get_globs() {
+  case "$1" in
+    add-e2e-test)       echo "tests/**/*.go" ;;
+    add-benchmark)      echo "benchmarks/**/*" ;;
+    update-support-lib) echo "tests/common/support/**/*.go" ;;
+    *)                  echo "" ;;
+  esac
+}
+
+sync_cursor() {
+  mkdir -p "$CURSOR_RULES_DIR"
+
+  for skill_dir in "$SKILLS_DIR"/*/; do
+    name=$(basename "$skill_dir")
+    skill_file="$skill_dir/SKILL.md"
+    [ -f "$skill_file" ] || continue
+
+    desc=$(get_description "$name")
+    glob=$(get_globs "$name")
+    out="$CURSOR_RULES_DIR/$name.mdc"
+
+    {
+      echo "---"
+      echo "description: \"$desc\""
+      [ -n "$glob" ] && echo "globs: \"$glob\""
+      echo "alwaysApply: false"
+      echo "---"
+      echo ""
+      cat "$skill_file"
+    } > "$out"
+
+    echo "  cursor: $out"
+  done
+}
+
+echo "Syncing skills from $SKILLS_DIR ..."
+sync_cursor
+echo "Done."