diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 000000000..16253061d --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,15 @@ +{ + "hooks": { + "PostToolUse": [ + { + "matcher": "Edit|Write", + "hooks": [ + { + "type": "command", + "command": "if [[ \"$TOOL_INPUT\" == *\".go\"* ]]; then ./bin/openshift-goimports 2>/dev/null; fi" + } + ] + } + ] + } +} \ No newline at end of file diff --git a/.claude/skills/add-benchmark/SKILL.md b/.claude/skills/add-benchmark/SKILL.md new file mode 100644 index 000000000..4cf83a786 --- /dev/null +++ b/.claude/skills/add-benchmark/SKILL.md @@ -0,0 +1,147 @@ +# Add Benchmark + +Guide for adding a new benchmark to `benchmarks/` in the distributed-workloads repo. + +## Directory layout + +Each benchmark lives in its own subdirectory under `benchmarks/`: + +``` +benchmarks/<name>/ + Dockerfile # Multi-stage build for the benchmark image + Dockerfile.cuda # (optional) CUDA variant + mpi-runtime.yaml # ClusterTrainingRuntime defining the MPI execution environment + trainjob.yaml # TrainJob manifest to submit the benchmark + README.md # Documentation (what, files, quick start, parameters, output) + <scripts> # (optional) Training/benchmark scripts mounted via ConfigMap +``` + +See `benchmarks/osu-benchmarks/` and `benchmarks/kftv2-mpi-ddp-sft/` as reference implementations. + +## Dockerfile conventions + +Follow the multi-stage build pattern used in `benchmarks/osu-benchmarks/Dockerfile`: + +1. **Stage 1 (builder)** - compile dependencies from source (e.g., OpenMPI, benchmark binaries) +2.
**Stage 2 (runtime)** - copy built artifacts, configure SSH for MPI, set up the runtime environment + +Key requirements: +- Base image from `quay.io/opendatahub/` or `quay.io/modh/` +- `USER 0` only during build stages; final image must use `USER 1001` +- OpenShift GID 0 pattern: `chgrp -R 0 <dir> && chmod -R g=u <dir>` +- Allow random UID: `chmod g=u /etc/passwd` +- SSH authentication via Training Operator's `sshAuthMountPath` -- keys are auto-injected at the path specified in the ClusterTrainingRuntime, not baked into the image. Workers generate host keys at startup. +- For CUDA variants, create a separate `Dockerfile.cuda` extending the base + +## ClusterTrainingRuntime + +Define a `ClusterTrainingRuntime` resource with MPI configuration. Key fields: + +```yaml +apiVersion: trainer.kubeflow.org/v1alpha1 +kind: ClusterTrainingRuntime +metadata: + name: <runtime-name> +spec: + mlPolicy: + mpi: + mpiImplementation: OpenMPI + sshAuthMountPath: /tmp/ssh + template: + spec: + replicatedJobs: + - name: launcher + replicas: 1 + template: ... + - name: worker + replicas: <num-workers> + template: ... +``` + +- Launcher: runs the benchmark command (mpirun/mpiexec) +- Workers: run sshd and wait for MPI connections +- Both need the SSH setup commands in their entrypoints + +See `benchmarks/osu-benchmarks/mpi-runtime-cpu.yaml` for a complete example. + +## TrainJob + +Submit benchmarks using a `TrainJob` with `generateName` (not fixed `name`): + +```yaml +apiVersion: trainer.kubeflow.org/v1alpha1 +kind: TrainJob +metadata: + generateName: <name>- + namespace: <namespace> +spec: + runtimeRef: + apiGroup: trainer.kubeflow.org + kind: ClusterTrainingRuntime + name: <runtime-name> + trainer: + numNodes: 2 + resourcesPerNode: + requests: + nvidia.com/gpu: "2" + env: + - name: PARAM_NAME + value: "value" +``` + +Use `trainer.env` for benchmark parameters - the controller injects them into all pod containers. + +See `benchmarks/kftv2-mpi-ddp-sft/trainjob.yaml` for a complete example.
+ +## Makefile targets + +Add build/push targets to the root `Makefile` following the existing pattern: + +```makefile +BENCHMARK_VERSION ?= latest + +.PHONY: build-<name>-benchmark-image +build-<name>-benchmark-image: + $(CONTAINER_ENGINE) build -t quay.io/modh/distributed-workloads-benchmark:trainer-mpi-<name>-$(BENCHMARK_VERSION) \ + -f benchmarks/<name>/Dockerfile benchmarks/<name>/ + +.PHONY: push-<name>-benchmark-image +push-<name>-benchmark-image: + $(CONTAINER_ENGINE) push quay.io/modh/distributed-workloads-benchmark:trainer-mpi-<name>-$(BENCHMARK_VERSION) +``` + +Registry: `quay.io/modh/distributed-workloads-benchmark` +Tag format: `trainer-mpi-<name>-<version>` + +## CI workflow + +Create `.github/workflows/build-and-push-<name>-benchmark.yml` matching the structure in `build-and-push-osu-benchmark.yml`: + +- Trigger on push/PR when files under `benchmarks/<name>/` change +- Build on all branches, push only on `main` +- Use `docker/build-push-action` with appropriate Dockerfile path + +## README + +Every benchmark must include a `README.md` with these sections (see `benchmarks/kftv2-mpi-ddp-sft/README.md`): + +| Section | Content | +|---------|---------| +| Title + summary | One-line description of what the benchmark measures | +| What this benchmark does | Table with algorithm, model, dataset, backend, runtime, image | +| Files | Table mapping each file to its purpose | +| Quick start | Numbered steps: deploy runtime, create namespace/ConfigMap, submit TrainJob, monitor | +| Scaling | Table showing node/GPU configurations | +| Benchmark parameters | Tables for training and infrastructure parameters with defaults and impact | +| Expected output | Example benchmark summary output | +| Known issues | Documented limitations and workarounds | +| Cleanup | Commands to remove all created resources | + +## Checklist + +- [ ] Dockerfile builds successfully: `make build-<name>-benchmark-image` +- [ ] ClusterTrainingRuntime applies: `oc apply -f benchmarks/<name>/mpi-runtime.yaml` +- [ ] TrainJob submits and runs: `oc create -f
benchmarks/<name>/trainjob.yaml` +- [ ] README has all required sections +- [ ] Makefile targets added for build and push +- [ ] CI workflow triggers on path changes to `benchmarks/<name>/` diff --git a/.claude/skills/add-e2e-test/SKILL.md b/.claude/skills/add-e2e-test/SKILL.md new file mode 100644 index 000000000..3e24c53ff --- /dev/null +++ b/.claude/skills/add-e2e-test/SKILL.md @@ -0,0 +1,97 @@ +# Add E2E Test + +Guide for adding a new end-to-end test to the distributed-workloads repo. + +## Test structure + +```go +func TestMyFeature(t *testing.T) { + Tags(t, Tier1) // 1. tag / skip checks + test := With(t) // 2. create test context + + namespace := test.NewTestNamespace().Name // 3. isolated namespace + + // 4. create resources with GenerateName + // 5. ensure cleanup of cluster-scoped resources + // 6. assert with test.Eventually(...) +} +``` + +## Namespace isolation + +Every test must operate in its own dedicated namespace. Use `test.NewTestNamespace()` — it creates a uniquely named namespace and registers automatic cleanup (log collection + deletion) via `t.Cleanup`: + +```go +namespace := test.NewTestNamespace().Name +``` + +Never use a fixed namespace name unless driven by an env var for a specific scenario (e.g., pre-upgrade/post-upgrade tests). Shared namespaces cause interference between tests. + +## Resource naming + +All Kubernetes resources must use `GenerateName` instead of a fixed `Name` to avoid collisions: + +```go +// Good +ObjectMeta: metav1.ObjectMeta{GenerateName: "test-trainjob-"} + +// Bad +ObjectMeta: metav1.ObjectMeta{Name: "my-trainjob"} +``` + +## Cleanup + +Namespace-scoped resources are deleted automatically when the test namespace is cleaned up. Cluster-scoped resources (e.g., `ClusterRole`, `ClusterRoleBinding`) are not namespace-bound and may need to be explicitly cleaned up if the helper creating them does not already register a cleanup hook via `t.T().Cleanup(...)`. + +## Tags + +All tests **must** declare a tag -- this is mandatory.
Apply it as the first statement so tests are skipped early when `TEST_TIER` is set: + +| Tag | When to use | +|-----|-------------| +| `Smoke` | Minimal deployment verification | +| `Tier1`–`Tier3` | Progressively deeper coverage | +| `Gpu(accelerator)` | Requires at least one GPU node | +| `MultiGpu(accelerator, n)` | Requires n GPUs per node | +| `MultiNode(n)` | Requires n worker nodes | +| `MultiNodeGpu(n, accelerator)` | Requires n nodes each with at least one GPU | +| `MultiNodeMultiGpu(n, accelerator, gpus)` | Requires n nodes each with at least gpus GPUs | + +## Environment variables + +Declare env var constants and getter functions in `tests/common/support/environment.go`. Never use `os.Getenv` directly in test files — always go through a getter. + +## Editing notebooks + +Test notebooks (`tests/**/resources/*.ipynb`) use 1-space JSON indentation with no trailing newline. When editing notebook cells, preserve the array-of-lines source format — do not collapse source arrays into single strings: + +```json +// Good — array of lines, readable in raw JSON +"source": [ + "import os\n", + "print('hello')" +] + +// Bad — single string, hard to read in raw JSON +"source": "import os\nprint('hello')" +``` + +If a tool (e.g. `NotebookEdit`) converts the edited cell's source to a single string, convert it back to array-of-lines before committing. You can use a Python script: + +```python +import json +with open(path, encoding="utf-8") as f: + nb = json.load(f) +for cell in nb["cells"]: + if isinstance(cell["source"], str): + cell["source"] = cell["source"].splitlines(True) + # Ensure last line has no trailing newline (notebook convention) + if cell["source"] and cell["source"][-1].endswith("\n"): + cell["source"][-1] = cell["source"][-1][:-1] +with open(path, "w", encoding="utf-8") as f: + json.dump(nb, f, indent=1, ensure_ascii=False) +``` + +## Key support library files + +See the [update-support-lib skill](../update-support-lib/SKILL.md) for the full file map. 
The most frequently used files when writing tests: `test.go` (Test interface), `client.go` (API clients), `environment.go` (env var getters), and the per-API helpers (`trainjob.go`, `pytorchjob.go`, `ray.go`, `kueue.go`). diff --git a/.claude/skills/update-support-lib/SKILL.md b/.claude/skills/update-support-lib/SKILL.md new file mode 100644 index 000000000..bdbebc906 --- /dev/null +++ b/.claude/skills/update-support-lib/SKILL.md @@ -0,0 +1,237 @@ +# Update Support Library + +Guide for modifying the shared test support library at `tests/common/support/`. + +## File organization + +| File | Domain | +|------|--------| +| `test.go` | Test interface (`With(t)`, `Ctx()`, `Client()`, `NewTestNamespace()`) | +| `client.go` | Client interface with 13 API accessors (Core, Trainer, Kubeflow, Ray, etc.) | +| `namespace.go` | Namespace creation, cleanup, log/event capture | +| `environment.go` | Environment variable constants and typed getter functions | +| `defaults.go` | Hardcoded default image versions and fallback values | +| `core.go` | Pod, ConfigMap, Secret, PVC helpers | +| `trainjob.go` | TrainJob getters and condition checkers | +| `pytorchjob.go` | PyTorchJob getters and condition checkers | +| `ray.go` | RayJob/RayCluster helpers | +| `kueue.go` | ResourceFlavor, ClusterQueue, LocalQueue helpers | +| `conditions.go` | Generic Kubernetes condition evaluation | +| `events.go` | Event capture and formatting for debugging | +| `rbac.go` | Role/RoleBinding creation | +| `accelerator.go` | GPU node detection | +| `fakeclient.go` | Fake client setup for unit tests (`NewTest(t)`) | + +## Async getter pattern + +Resource getters return a closure for use with `test.Eventually(...)`: + +```go +func TrainJob(t Test, namespace, name string) func(g gomega.Gomega) *trainerv1alpha1.TrainJob { + return func(g gomega.Gomega) *trainerv1alpha1.TrainJob { + job, err := t.Client().Trainer().TrainerV1alpha1().TrainJobs(namespace).Get(t.Ctx(), name, metav1.GetOptions{}) + 
g.Expect(err).NotTo(gomega.HaveOccurred()) + return job + } +} +``` + +Follow this pattern when adding getters for new resource types. The outer function captures the test context; the inner function is retried by gomega. + +## Resource creation pattern + +```go +func CreateMyResource(t Test, namespace string, content map[string][]byte) *corev1.MyResource { + t.T().Helper() + + resource := &corev1.MyResource{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "MyResource", + }, + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "my-resource-", + Namespace: namespace, + }, + // ... fields + } + + resource, err := t.Client().Core().CoreV1().MyResources(namespace).Create(t.Ctx(), resource, metav1.CreateOptions{}) + t.Expect(err).NotTo(gomega.HaveOccurred()) + t.T().Logf("Created MyResource %s/%s successfully", namespace, resource.Name) + + return resource +} +``` + +Key conventions: +- Always call `t.T().Helper()` first +- Use `GenerateName`, never fixed `Name` +- Assert errors with `t.Expect(err).NotTo(gomega.HaveOccurred())` +- Log the created resource name + +## Condition checker pattern + +```go +func MyResourceConditionReady(resource *v1alpha1.MyResource) metav1.ConditionStatus { + return MyResourceCondition(resource, v1alpha1.MyResourceReady) +} + +func MyResourceCondition(resource *v1alpha1.MyResource, conditionType string) metav1.ConditionStatus { + for _, condition := range resource.Status.Conditions { + if string(condition.Type) == conditionType { + return condition.Status + } + } + return metav1.ConditionUnknown +} +``` + +Create one exported function per condition type (Ready, Failed, Complete, etc.) that delegates to a generic condition extractor. 
+ +## Option pattern + +Used for flexible configuration of namespace, PVC, and other resources: + +```go +type Option[T any] interface { + ApplyTo(to T) error +} + +type ErrorOption[T any] func(to T) error +func (f ErrorOption[T]) ApplyTo(to T) error { return f(to) } +``` + +Example - adding a label to a namespace: + +```go +func WithKueueManaged() Option[*corev1.Namespace] { + return ErrorOption[*corev1.Namespace](func(ns *corev1.Namespace) error { + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + ns.Labels["kueue.x-k8s.io/managed"] = "true" + return nil + }) +} +``` + +Options are applied via a loop before the API call: + +```go +for _, option := range options { + t.Expect(option.ApplyTo(resource)).To(gomega.Succeed()) +} +``` + +## Adding a new API client + +To add a client for a new Kubernetes API: + +1. **Add the import** in `client.go`: + ```go + newclient "github.com/org/project/pkg/client/clientset/versioned" + ``` + +2. **Extend the `Client` interface**: + ```go + NewAPI() newclient.Interface + ``` + +3. **Add a field to `testClient` struct**: + ```go + newAPI newclient.Interface + ``` + +4. **Add the accessor method**: + ```go + func (t *testClient) NewAPI() newclient.Interface { return t.newAPI } + ``` + +5. **Initialize in `newTestClient()`** (in `test.go`): + ```go + newAPI, err := newclient.NewForConfig(cfg) + // handle error + ``` + +6. **Update `fakeclient.go`** to include the new client for unit tests. + +7. **Run `go mod tidy`** to pull the new dependency. + +## Adding environment variables + +Follow the constant + getter pattern in `environment.go`: + +```go +const ( + MyNewVar = "MY_NEW_VAR" +) + +func GetMyNewVar(t Test) string { + t.T().Helper() + return lookupEnvOrDefault(t, MyNewVar, "default-value") +} +``` + +For training images that support operator-injected defaults, use the three-level resolution in `defaults.go`: +1. Test env var (e.g., `TEST_TRAINING_CUDA_PYTORCH_28_IMAGE`) +2. Operator `RELATED_IMAGE_*` env var +3. 
Hardcoded default in `defaults.go` + +## Writing unit tests + +Use `NewTest(t)` from `fakeclient.go` to create a test context with fake clients: + +```go +func TestMyHelper(t *testing.T) { + test := NewTest(t) + + // Create test fixtures via fake client + resource := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "test-namespace", + }, + } + test.client.Core().CoreV1().Pods("test-namespace").Create(test.ctx, resource, metav1.CreateOptions{}) + + // Call the function under test + result := GetPods(test, "test-namespace", metav1.ListOptions{}) + + // Assert + test.Expect(result).Should(gomega.HaveLen(1)) + test.Expect(result[0].Name).To(gomega.Equal("test-pod")) +} +``` + +See `core_test.go`, `trainjob_test.go`, `environment_test.go` for more examples. + +## Per-suite extensions + +Put helpers in per-suite `support.go` (e.g., `tests/trainer/support.go`) when they: +- Use embedded test resources specific to that suite +- Reference suite-specific APIs or configurations +- Would not be useful to other test suites + +Put helpers in `tests/common/support/` when they: +- Work with standard Kubernetes or shared custom resources +- Could be reused across multiple test suites + +## Validation + +```bash +make unit-test # Run all support lib unit tests +make golangci-lint LINT_PKG=./tests/common/support/... # Lint the support package +go vet ./tests/common/support/... 
# Vet the support package +make verify-imports # Verify import ordering +``` + +## Checklist + +- [ ] New helpers follow the async getter or resource creation pattern +- [ ] `GenerateName` used for all created resources +- [ ] `t.T().Helper()` called at the top of every helper function +- [ ] Unit tests added in a corresponding `_test.go` file +- [ ] `make unit-test` passes +- [ ] `make golangci-lint LINT_PKG=./tests/common/support/...` passes +- [ ] `make verify-imports` passes diff --git a/.cursor/rules/add-benchmark.mdc b/.cursor/rules/add-benchmark.mdc new file mode 100644 index 000000000..d3df5f90c --- /dev/null +++ b/.cursor/rules/add-benchmark.mdc @@ -0,0 +1,153 @@ +--- +description: "Guide for adding benchmarks to benchmarks/" +globs: "benchmarks/**/*" +alwaysApply: false +--- + +# Add Benchmark + +Guide for adding a new benchmark to `benchmarks/` in the distributed-workloads repo. + +## Directory layout + +Each benchmark lives in its own subdirectory under `benchmarks/`: + +``` +benchmarks/<name>/ + Dockerfile # Multi-stage build for the benchmark image + Dockerfile.cuda # (optional) CUDA variant + mpi-runtime.yaml # ClusterTrainingRuntime defining the MPI execution environment + trainjob.yaml # TrainJob manifest to submit the benchmark + README.md # Documentation (what, files, quick start, parameters, output) + <scripts> # (optional) Training/benchmark scripts mounted via ConfigMap +``` + +See `benchmarks/osu-benchmarks/` and `benchmarks/kftv2-mpi-ddp-sft/` as reference implementations. + +## Dockerfile conventions + +Follow the multi-stage build pattern used in `benchmarks/osu-benchmarks/Dockerfile`: + +1. **Stage 1 (builder)** - compile dependencies from source (e.g., OpenMPI, benchmark binaries) +2.
**Stage 2 (runtime)** - copy built artifacts, configure SSH for MPI, set up the runtime environment + +Key requirements: +- Base image from `quay.io/opendatahub/` or `quay.io/modh/` +- `USER 0` only during build stages; final image must use `USER 1001` +- OpenShift GID 0 pattern: `chgrp -R 0 <dir> && chmod -R g=u <dir>` +- Allow random UID: `chmod g=u /etc/passwd` +- SSH authentication via Training Operator's `sshAuthMountPath` -- keys are auto-injected at the path specified in the ClusterTrainingRuntime, not baked into the image. Workers generate host keys at startup. +- For CUDA variants, create a separate `Dockerfile.cuda` extending the base + +## ClusterTrainingRuntime + +Define a `ClusterTrainingRuntime` resource with MPI configuration. Key fields: + +```yaml +apiVersion: trainer.kubeflow.org/v1alpha1 +kind: ClusterTrainingRuntime +metadata: + name: <runtime-name> +spec: + mlPolicy: + mpi: + mpiImplementation: OpenMPI + sshAuthMountPath: /tmp/ssh + template: + spec: + replicatedJobs: + - name: launcher + replicas: 1 + template: ... + - name: worker + replicas: <num-workers> + template: ... +``` + +- Launcher: runs the benchmark command (mpirun/mpiexec) +- Workers: run sshd and wait for MPI connections +- Both need the SSH setup commands in their entrypoints + +See `benchmarks/osu-benchmarks/mpi-runtime-cpu.yaml` for a complete example. + +## TrainJob + +Submit benchmarks using a `TrainJob` with `generateName` (not fixed `name`): + +```yaml +apiVersion: trainer.kubeflow.org/v1alpha1 +kind: TrainJob +metadata: + generateName: <name>- + namespace: <namespace> +spec: + runtimeRef: + apiGroup: trainer.kubeflow.org + kind: ClusterTrainingRuntime + name: <runtime-name> + trainer: + numNodes: 2 + resourcesPerNode: + requests: + nvidia.com/gpu: "2" + env: + - name: PARAM_NAME + value: "value" +``` + +Use `trainer.env` for benchmark parameters - the controller injects them into all pod containers. + +See `benchmarks/kftv2-mpi-ddp-sft/trainjob.yaml` for a complete example.
+ +## Makefile targets + +Add build/push targets to the root `Makefile` following the existing pattern: + +```makefile +BENCHMARK_VERSION ?= latest + +.PHONY: build-<name>-benchmark-image +build-<name>-benchmark-image: + $(CONTAINER_ENGINE) build -t quay.io/modh/distributed-workloads-benchmark:trainer-mpi-<name>-$(BENCHMARK_VERSION) \ + -f benchmarks/<name>/Dockerfile benchmarks/<name>/ + +.PHONY: push-<name>-benchmark-image +push-<name>-benchmark-image: + $(CONTAINER_ENGINE) push quay.io/modh/distributed-workloads-benchmark:trainer-mpi-<name>-$(BENCHMARK_VERSION) +``` + +Registry: `quay.io/modh/distributed-workloads-benchmark` +Tag format: `trainer-mpi-<name>-<version>` + +## CI workflow + +Create `.github/workflows/build-and-push-<name>-benchmark.yml` matching the structure in `build-and-push-osu-benchmark.yml`: + +- Trigger on push/PR when files under `benchmarks/<name>/` change +- Build on all branches, push only on `main` +- Use `docker/build-push-action` with appropriate Dockerfile path + +## README + +Every benchmark must include a `README.md` with these sections (see `benchmarks/kftv2-mpi-ddp-sft/README.md`): + +| Section | Content | +|---------|---------| +| Title + summary | One-line description of what the benchmark measures | +| What this benchmark does | Table with algorithm, model, dataset, backend, runtime, image | +| Files | Table mapping each file to its purpose | +| Quick start | Numbered steps: deploy runtime, create namespace/ConfigMap, submit TrainJob, monitor | +| Scaling | Table showing node/GPU configurations | +| Benchmark parameters | Tables for training and infrastructure parameters with defaults and impact | +| Expected output | Example benchmark summary output | +| Known issues | Documented limitations and workarounds | +| Cleanup | Commands to remove all created resources | + +## Checklist + +- [ ] Dockerfile builds successfully: `make build-<name>-benchmark-image` +- [ ] ClusterTrainingRuntime applies: `oc apply -f benchmarks/<name>/mpi-runtime.yaml` +- [ ] TrainJob submits and runs: `oc create -f
benchmarks/<name>/trainjob.yaml` +- [ ] README has all required sections +- [ ] Makefile targets added for build and push +- [ ] CI workflow triggers on path changes to `benchmarks/<name>/` diff --git a/.cursor/rules/add-e2e-test.mdc b/.cursor/rules/add-e2e-test.mdc new file mode 100644 index 000000000..c5d597870 --- /dev/null +++ b/.cursor/rules/add-e2e-test.mdc @@ -0,0 +1,103 @@ +--- +description: "Guide for adding E2E tests to the distributed-workloads repo" +globs: "tests/**/*.go" +alwaysApply: false +--- + +# Add E2E Test + +Guide for adding a new end-to-end test to the distributed-workloads repo. + +## Test structure + +```go +func TestMyFeature(t *testing.T) { + Tags(t, Tier1) // 1. tag / skip checks + test := With(t) // 2. create test context + + namespace := test.NewTestNamespace().Name // 3. isolated namespace + + // 4. create resources with GenerateName + // 5. ensure cleanup of cluster-scoped resources + // 6. assert with test.Eventually(...) +} +``` + +## Namespace isolation + +Every test must operate in its own dedicated namespace. Use `test.NewTestNamespace()` — it creates a uniquely named namespace and registers automatic cleanup (log collection + deletion) via `t.Cleanup`: + +```go +namespace := test.NewTestNamespace().Name +``` + +Never use a fixed namespace name unless driven by an env var for a specific scenario (e.g., pre-upgrade/post-upgrade tests). Shared namespaces cause interference between tests. + +## Resource naming + +All Kubernetes resources must use `GenerateName` instead of a fixed `Name` to avoid collisions: + +```go +// Good +ObjectMeta: metav1.ObjectMeta{GenerateName: "test-trainjob-"} + +// Bad +ObjectMeta: metav1.ObjectMeta{Name: "my-trainjob"} +``` + +## Cleanup + +Namespace-scoped resources are deleted automatically when the test namespace is cleaned up.
Cluster-scoped resources (e.g., `ClusterRole`, `ClusterRoleBinding`) are not namespace-bound and may need to be explicitly cleaned up if the helper creating them does not already register a cleanup hook via `t.T().Cleanup(...)`. + +## Tags + +All tests **must** declare a tag -- this is mandatory. Apply it as the first statement so tests are skipped early when `TEST_TIER` is set: + +| Tag | When to use | +|-----|-------------| +| `Smoke` | Minimal deployment verification | +| `Tier1`–`Tier3` | Progressively deeper coverage | +| `Gpu(accelerator)` | Requires at least one GPU node | +| `MultiGpu(accelerator, n)` | Requires n GPUs per node | +| `MultiNode(n)` | Requires n worker nodes | +| `MultiNodeGpu(n, accelerator)` | Requires n nodes each with at least one GPU | +| `MultiNodeMultiGpu(n, accelerator, gpus)` | Requires n nodes each with at least gpus GPUs | + +## Environment variables + +Declare env var constants and getter functions in `tests/common/support/environment.go`. Never use `os.Getenv` directly in test files — always go through a getter. + +## Editing notebooks + +Test notebooks (`tests/**/resources/*.ipynb`) use 1-space JSON indentation with no trailing newline. When editing notebook cells, preserve the array-of-lines source format — do not collapse source arrays into single strings: + +```json +// Good — array of lines, readable in raw JSON +"source": [ + "import os\n", + "print('hello')" +] + +// Bad — single string, hard to read in raw JSON +"source": "import os\nprint('hello')" +``` + +If a tool (e.g. `NotebookEdit`) converts the edited cell's source to a single string, convert it back to array-of-lines before committing. 
You can use a Python script: + +```python +import json +with open(path, encoding="utf-8") as f: + nb = json.load(f) +for cell in nb["cells"]: + if isinstance(cell["source"], str): + cell["source"] = cell["source"].splitlines(True) + # Ensure last line has no trailing newline (notebook convention) + if cell["source"] and cell["source"][-1].endswith("\n"): + cell["source"][-1] = cell["source"][-1][:-1] +with open(path, "w", encoding="utf-8") as f: + json.dump(nb, f, indent=1, ensure_ascii=False) +``` + +## Key support library files + +See the [update-support-lib skill](../update-support-lib/SKILL.md) for the full file map. The most frequently used files when writing tests: `test.go` (Test interface), `client.go` (API clients), `environment.go` (env var getters), and the per-API helpers (`trainjob.go`, `pytorchjob.go`, `ray.go`, `kueue.go`). diff --git a/.cursor/rules/update-support-lib.mdc b/.cursor/rules/update-support-lib.mdc new file mode 100644 index 000000000..4d66b336f --- /dev/null +++ b/.cursor/rules/update-support-lib.mdc @@ -0,0 +1,243 @@ +--- +description: "Guide for modifying the shared test support library" +globs: "tests/common/support/**/*.go" +alwaysApply: false +--- + +# Update Support Library + +Guide for modifying the shared test support library at `tests/common/support/`. + +## File organization + +| File | Domain | +|------|--------| +| `test.go` | Test interface (`With(t)`, `Ctx()`, `Client()`, `NewTestNamespace()`) | +| `client.go` | Client interface with 13 API accessors (Core, Trainer, Kubeflow, Ray, etc.) 
| +| `namespace.go` | Namespace creation, cleanup, log/event capture | +| `environment.go` | Environment variable constants and typed getter functions | +| `defaults.go` | Hardcoded default image versions and fallback values | +| `core.go` | Pod, ConfigMap, Secret, PVC helpers | +| `trainjob.go` | TrainJob getters and condition checkers | +| `pytorchjob.go` | PyTorchJob getters and condition checkers | +| `ray.go` | RayJob/RayCluster helpers | +| `kueue.go` | ResourceFlavor, ClusterQueue, LocalQueue helpers | +| `conditions.go` | Generic Kubernetes condition evaluation | +| `events.go` | Event capture and formatting for debugging | +| `rbac.go` | Role/RoleBinding creation | +| `accelerator.go` | GPU node detection | +| `fakeclient.go` | Fake client setup for unit tests (`NewTest(t)`) | + +## Async getter pattern + +Resource getters return a closure for use with `test.Eventually(...)`: + +```go +func TrainJob(t Test, namespace, name string) func(g gomega.Gomega) *trainerv1alpha1.TrainJob { + return func(g gomega.Gomega) *trainerv1alpha1.TrainJob { + job, err := t.Client().Trainer().TrainerV1alpha1().TrainJobs(namespace).Get(t.Ctx(), name, metav1.GetOptions{}) + g.Expect(err).NotTo(gomega.HaveOccurred()) + return job + } +} +``` + +Follow this pattern when adding getters for new resource types. The outer function captures the test context; the inner function is retried by gomega. + +## Resource creation pattern + +```go +func CreateMyResource(t Test, namespace string, content map[string][]byte) *corev1.MyResource { + t.T().Helper() + + resource := &corev1.MyResource{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "MyResource", + }, + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "my-resource-", + Namespace: namespace, + }, + // ... 
fields + } + + resource, err := t.Client().Core().CoreV1().MyResources(namespace).Create(t.Ctx(), resource, metav1.CreateOptions{}) + t.Expect(err).NotTo(gomega.HaveOccurred()) + t.T().Logf("Created MyResource %s/%s successfully", namespace, resource.Name) + + return resource +} +``` + +Key conventions: +- Always call `t.T().Helper()` first +- Use `GenerateName`, never fixed `Name` +- Assert errors with `t.Expect(err).NotTo(gomega.HaveOccurred())` +- Log the created resource name + +## Condition checker pattern + +```go +func MyResourceConditionReady(resource *v1alpha1.MyResource) metav1.ConditionStatus { + return MyResourceCondition(resource, v1alpha1.MyResourceReady) +} + +func MyResourceCondition(resource *v1alpha1.MyResource, conditionType string) metav1.ConditionStatus { + for _, condition := range resource.Status.Conditions { + if string(condition.Type) == conditionType { + return condition.Status + } + } + return metav1.ConditionUnknown +} +``` + +Create one exported function per condition type (Ready, Failed, Complete, etc.) that delegates to a generic condition extractor. + +## Option pattern + +Used for flexible configuration of namespace, PVC, and other resources: + +```go +type Option[T any] interface { + ApplyTo(to T) error +} + +type ErrorOption[T any] func(to T) error +func (f ErrorOption[T]) ApplyTo(to T) error { return f(to) } +``` + +Example - adding a label to a namespace: + +```go +func WithKueueManaged() Option[*corev1.Namespace] { + return ErrorOption[*corev1.Namespace](func(ns *corev1.Namespace) error { + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + ns.Labels["kueue.x-k8s.io/managed"] = "true" + return nil + }) +} +``` + +Options are applied via a loop before the API call: + +```go +for _, option := range options { + t.Expect(option.ApplyTo(resource)).To(gomega.Succeed()) +} +``` + +## Adding a new API client + +To add a client for a new Kubernetes API: + +1. 
**Add the import** in `client.go`: + ```go + newclient "github.com/org/project/pkg/client/clientset/versioned" + ``` + +2. **Extend the `Client` interface**: + ```go + NewAPI() newclient.Interface + ``` + +3. **Add a field to `testClient` struct**: + ```go + newAPI newclient.Interface + ``` + +4. **Add the accessor method**: + ```go + func (t *testClient) NewAPI() newclient.Interface { return t.newAPI } + ``` + +5. **Initialize in `newTestClient()`** (in `test.go`): + ```go + newAPI, err := newclient.NewForConfig(cfg) + // handle error + ``` + +6. **Update `fakeclient.go`** to include the new client for unit tests. + +7. **Run `go mod tidy`** to pull the new dependency. + +## Adding environment variables + +Follow the constant + getter pattern in `environment.go`: + +```go +const ( + MyNewVar = "MY_NEW_VAR" +) + +func GetMyNewVar(t Test) string { + t.T().Helper() + return lookupEnvOrDefault(t, MyNewVar, "default-value") +} +``` + +For training images that support operator-injected defaults, use the three-level resolution in `defaults.go`: +1. Test env var (e.g., `TEST_TRAINING_CUDA_PYTORCH_28_IMAGE`) +2. Operator `RELATED_IMAGE_*` env var +3. Hardcoded default in `defaults.go` + +## Writing unit tests + +Use `NewTest(t)` from `fakeclient.go` to create a test context with fake clients: + +```go +func TestMyHelper(t *testing.T) { + test := NewTest(t) + + // Create test fixtures via fake client + resource := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "test-namespace", + }, + } + test.client.Core().CoreV1().Pods("test-namespace").Create(test.ctx, resource, metav1.CreateOptions{}) + + // Call the function under test + result := GetPods(test, "test-namespace", metav1.ListOptions{}) + + // Assert + test.Expect(result).Should(gomega.HaveLen(1)) + test.Expect(result[0].Name).To(gomega.Equal("test-pod")) +} +``` + +See `core_test.go`, `trainjob_test.go`, `environment_test.go` for more examples. 
+ +## Per-suite extensions + +Put helpers in per-suite `support.go` (e.g., `tests/trainer/support.go`) when they: +- Use embedded test resources specific to that suite +- Reference suite-specific APIs or configurations +- Would not be useful to other test suites + +Put helpers in `tests/common/support/` when they: +- Work with standard Kubernetes or shared custom resources +- Could be reused across multiple test suites + +## Validation + +```bash +make unit-test # Run all support lib unit tests +make golangci-lint LINT_PKG=./tests/common/support/... # Lint the support package +go vet ./tests/common/support/... # Vet the support package +make verify-imports # Verify import ordering +``` + +## Checklist + +- [ ] New helpers follow the async getter or resource creation pattern +- [ ] `GenerateName` used for all created resources +- [ ] `t.T().Helper()` called at the top of every helper function +- [ ] Unit tests added in a corresponding `_test.go` file +- [ ] `make unit-test` passes +- [ ] `make golangci-lint LINT_PKG=./tests/common/support/...` passes +- [ ] `make verify-imports` passes diff --git a/AGENTS.md b/AGENTS.md index 1f3da2fd6..0a5529ea9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,6 +2,8 @@ E2E test suite for distributed workloads on RHOAI covering KFTO v1, Trainer v2, and KubeRay, plus training examples and runtime/test images. Built with Go, Python, Kubernetes, Ray, PyTorch. +See [ARCHITECTURE.md](ARCHITECTURE.md) for the full repository structure including test suites, images, benchmarks, and examples. + ## Structure - `tests/` - E2E test suites (Go) @@ -51,111 +53,45 @@ make precommit # Run all pre-commit hooks ### Targeted lint/format -For quick feedback on specific files instead of running project-wide: - ```bash -# Go -make golangci-lint LINT_PKG=./tests/common/support/... # Lint a single Go package -go vet ./tests/common/support/... 
# Vet a single Go package -gofmt -w path/to/file.go # Format a single Go file - -# Python -pre-commit run --files path/to/file.py # Run all hooks on a single file - +make golangci-lint LINT_PKG=./path/to/package/... # Lint a single Go package +go vet ./path/to/package/... # Vet a single Go package +gofmt -w path/to/file.go # Format a single Go file +pre-commit run --files path/to/file.py # Run all hooks on a single file ``` ## Writing Tests -### Namespace isolation - -Every test must operate in its own dedicated namespace. Use `test.NewTestNamespace()` — it creates a uniquely named namespace and registers automatic cleanup (log collection + deletion) via `t.Cleanup`: - -```go -namespace := test.NewTestNamespace().Name -``` - -Never use a fixed namespace name unless driven by an env var for a specific scenario (e.g., pre-upgrade/post-upgrade tests). Shared namespaces cause interference between tests. - -### Resource naming - -All Kubernetes resources must use `GenerateName` instead of a fixed `Name` to avoid collisions: +See [`.claude/skills/add-e2e-test/SKILL.md`](.claude/skills/add-e2e-test/SKILL.md) for the full guide on writing E2E tests (namespace isolation, resource naming, cleanup, tags, notebook editing, environment variables). -```go -// Good -ObjectMeta: metav1.ObjectMeta{GenerateName: "test-trainjob-"} +## Benchmarks -// Bad -ObjectMeta: metav1.ObjectMeta{Name: "my-trainjob"} -``` - -### Cleanup - -Namespace-scoped resources are deleted automatically when the test namespace is cleaned up. Cluster-scoped resources (e.g., `ClusterRole`, `ClusterRoleBinding`) are not namespace-bound and may need to be explicitly cleaned up if the helper creating them does not already register a cleanup hook via `t.T().Cleanup(...)`. - -### Test structure - -```go -func TestMyFeature(t *testing.T) { - Tags(t, Tier1) // 1. tag / skip checks - test := With(t) // 2. 
create test context +See [`.claude/skills/add-benchmark/SKILL.md`](.claude/skills/add-benchmark/SKILL.md) for the guide on adding new benchmarks (Dockerfile, ClusterTrainingRuntime, TrainJob, CI workflow). - namespace := test.NewTestNamespace().Name // 3. isolated namespace +## Support Library - // 4. create resources with GenerateName - // 5. ensure cleanup of cluster-scoped resources - // 6. assert with test.Eventually(...) -} -``` - -### Editing notebooks +See [`.claude/skills/update-support-lib/SKILL.md`](.claude/skills/update-support-lib/SKILL.md) for the guide on modifying the shared test support library (getters, condition checkers, client abstraction, option pattern). -Test notebooks (`tests/**/resources/*.ipynb`) use 1-space JSON indentation with no trailing newline. When editing notebook cells, preserve the array-of-lines source format — do not collapse source arrays into single strings: +## Common Workflows -```json -// Good — array of lines, readable in raw JSON -"source": [ - "import os\n", - "print('hello')" -] +The most frequent tasks in this repo, based on commit history: -// Bad — single string, hard to read in raw JSON -"source": "import os\nprint('hello')" -``` - -If a tool (e.g. `NotebookEdit`) converts the edited cell's source to a single string, convert it back to array-of-lines before committing. 
You can use a Python script: - -```python -import json -with open(path, encoding="utf-8") as f: - nb = json.load(f) -for cell in nb["cells"]: - if isinstance(cell["source"], str): - cell["source"] = cell["source"].splitlines(True) - # Ensure last line has no trailing newline (notebook convention) - if cell["source"] and cell["source"][-1].endswith("\n"): - cell["source"][-1] = cell["source"][-1][:-1] -with open(path, "w", encoding="utf-8") as f: - json.dump(nb, f, indent=1, ensure_ascii=False) -``` +- **CVE-driven Python dependency updates** -- updating a single dependency across training image variants (see CVE Fixes below) +- **Adding E2E tests** -- see [Writing Tests](#writing-tests) +- **Adding benchmarks** -- see [Benchmarks](#benchmarks) +- **Updating the support library** -- see [Support Library](#support-library) -### Environment variables +Commit message format for JIRA-tracked work: `RHOAIENG-NNNNN: <summary> in <component>` -Declare env var constants and getter functions in `tests/common/support/environment.go`. Never use `os.Getenv` directly in test files — always go through a getter. +## CVE Fixes -- Python dependency updates -### Tags +Two image families with different dependency management: -Tests in `tests/trainer/` **must** declare a tag — this is mandatory. Apply it as the first statement so tests are skipped early when `TEST_TIER` is set: +- **Runtime training images** (`images/runtime/training/`) use `Pipfile`/`Pipfile.lock` (pipenv) and pull from public PyPI. See [images/runtime/training/README.md](images/runtime/training/README.md). +- **Universal training images** (`images/universal/training/`) use `pyproject.toml`/`requirements.txt` (pip) and pull from a **private AIPCC PyPI index** -- always query the index for available versions before pinning. See [images/universal/training/README.md](images/universal/training/README.md#cve-fixes--python-dependency-updates). 
-| Tag | When to use | -|-----|-------------| -| `Smoke` | Minimal deployment verification | -| `Tier1`–`Tier3` | Progressively deeper coverage | -| `Gpu(accelerator)` | Requires at least one GPU node | -| `MultiGpu(accelerator, n)` | Requires n GPUs per node | -| `MultiNode(n)` | Requires n worker nodes | -| `MultiNodeGpu(n, accelerator)` | Requires n nodes each with at least one GPU | -| `MultiNodeMultiGpu(n, accelerator, gpus)` | Requires n nodes each with at least gpus GPUs | +Each image variant is updated independently with its own commit. -## CVE Fixes — Python dependency updates +## AI Agent Skills -See [images/universal/training/README.md](images/universal/training/README.md#cve-fixes--python-dependency-updates) for instructions on updating Python dependencies in training images. Key point: dependencies come from a private AIPCC PyPI index, not public PyPI — always query the index for available versions before pinning. +`.claude/skills/` is the canonical source for AI agent skills. Run `make sync-agent-skills` after editing any skill to sync to other tools (Cursor, etc.). diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 000000000..e1e7c71d6 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,170 @@ +# Architecture + +E2E test suite for distributed workloads on Red Hat OpenShift AI (RHOAI), covering Kubeflow Training Operator v1 (KFTO), Kubeflow Trainer v2, and KubeRay. 
+ +## Test suites + +```text +tests/ +├── kfto/ KFTO v1 — PyTorchJob-based distributed training +├── trainer/ Kubeflow Trainer v2 — TrainJob / JobSet-based training +├── odh/ KubeRay — Ray cluster and RayJob-based training +├── fms/ Foundation model fine-tuning (fms-hf-tuning) +│ ├── kfto/ via KFTO PyTorchJob +│ └── trainer/ via Trainer v2 TrainJob +└── common/ Shared test infrastructure + └── support/ Client abstractions, resource helpers, test lifecycle +``` + +### kfto — Kubeflow Training Operator v1 + +Tests PyTorchJob-based distributed training using the legacy Kubeflow Training Operator. Covers MNIST training (single/multi-node, single/multi-GPU), LLM supervised fine-tuning (SFT), Kueue integration, SDK usage, and upgrade scenarios. + +### trainer — Kubeflow Trainer v2 + +Tests TrainJob-based distributed training using the modern Kubeflow Trainer v2. Covers PyTorch DDP (Fashion MNIST, multi-node/multi-GPU), MPI jobs (OpenMPI), Kueue integration, TrainingRuntime/ClusterTrainingRuntime, Kubeflow SDK, and upgrade scenarios. This is the primary and most actively developed test suite. + +### odh — KubeRay + +Tests Ray-based distributed training via RayCluster and RayJob. Covers MNIST training, Ray Tune hyperparameter optimization, and LLM fine-tuning with DeepSpeed. + +### fms — Foundation model fine-tuning + +Tests the fms-hf-tuning container image for LLM fine-tuning (SFT, LoRA, QLoRA) through two parallel orchestration paths: KFTO PyTorchJob (`fms/kfto/`) and Trainer v2 TrainJob (`fms/trainer/`). Both paths test the same training workload with different orchestration, validating that fms-hf-tuning works correctly under each framework. Includes S3 data staging via batch jobs. + +## Suite relationships + +- **kfto** is the legacy operator; **trainer** is its modern replacement. Both test PyTorch distributed training but via different CRDs (PyTorchJob vs TrainJob). 
+- **fms** tests the same fms-hf-tuning workload via both kfto and trainer, ensuring parity across orchestration frameworks. +- **odh** covers Ray-based parallelism, complementing the PyTorch-based kfto and trainer suites. + +## Shared support library + +`tests/common/support/` provides the test infrastructure used by all suites (~40 files). + +### Test lifecycle + +- **`test.go`** — `Test` interface: wraps `*testing.T` with gomega assertions (`Eventually`, `Expect`), context management, and namespace helpers. +- **`namespace.go`** — `NewTestNamespace()`: creates an isolated namespace per test with automatic cleanup (pod log collection, event capture, namespace deletion) via `t.Cleanup`. + +### Client abstraction + +- **`client.go`** — `Client` interface: lazy-initialized accessor for multiple Kubernetes API clients: + - Core Kubernetes, Dynamic, Storage + - Kubeflow Training Operator (`kubeflowclient`) + - Kubeflow Trainer v2 (`trainerclient`) + - KubeRay (`rayclient`) + - Kueue (`kueueclient`), Kueue Operator + - JobSet (`jobsetclient`) + - OpenShift Machine API, Routes, ImageStreams + - OLM (Operator Lifecycle Manager) + +### Per-API resource helpers + +Each distributed workload API has a dedicated helper file with getters, condition checkers, and builders: + +| File | API | +|------|-----| +| `pytorchjob.go` | PyTorchJob (Running, Succeeded, Failed, Suspended) | +| `trainjob.go` | TrainJob (Complete, Failed, Suspended) | +| `ray.go` | RayJob, RayCluster (status, logs) | +| `kueue.go` | ResourceFlavor, ClusterQueue, LocalQueue, workload admission | +| `jobset.go` | JobSet resources | + +### Other shared utilities + +| File | Purpose | +|------|---------| +| `environment.go` | Environment variable getters (never use `os.Getenv` directly) | +| `core.go` | Pod, ConfigMap, Secret helpers | +| `rbac.go` | Role / RoleBinding creation for test isolation | +| `conditions.go` | Kubernetes condition evaluation | +| `events.go` | Event capture for debugging | +| 
`accelerator.go` | GPU node detection | + +### Per-suite extensions + +Each suite has a `support.go` that imports `tests/common/support` and adds suite-specific utilities (e.g., embedded test resource files via `//go:embed`, Prometheus queries for GPU utilization). Per-suite files extend — never wrap — the common `Test` interface. + +### Common utilities outside support/ + +`tests/common/` (outside `support/`) provides cross-suite utilities: + +| File | Purpose | +|------|---------| +| `test_tag.go` | Tag functions (`Smoke`, `Tier1`–`Tier3`, `Gpu`, `MultiNode`, etc.) and `Tags()` helper for test filtering | +| `environment.go` | Shared env var getters (test tier, notebook config, HuggingFace token) | +| `notebook.go` | Notebook creation with GPU allocation and Kueue integration | +| `template.go` | Go template parsing for dynamic Kubernetes manifests | + +## Benchmarks + +```text +benchmarks/ +├── kftv2-mpi-ddp-sft/ MPI DDP SFT training (Qwen 2.5 + GSM8K) +│ ├── README.md +│ ├── mpi-runtime.yaml ClusterTrainingRuntime +│ ├── train_sft_ddp.py Training script (mounted via ConfigMap) +│ └── trainjob.yaml TrainJob manifest +└── osu-benchmarks/ OSU MPI micro-benchmarks (point-to-point + collective) + ├── Dockerfile CPU variant + ├── Dockerfile.cuda CUDA variant + ├── mpi-runtime-cpu.yaml ClusterTrainingRuntime (CPU) + ├── mpi-runtime-gpu.yaml ClusterTrainingRuntime (GPU) + ├── osu-trainjob-cpu.yaml TrainJob (CPU) + ├── osu-trainjob-gpu.yaml TrainJob (GPU) + └── uid_entrypoint.sh UID entrypoint for OpenShift +``` + +Each benchmark defines a **ClusterTrainingRuntime** (MPI execution environment) and a **TrainJob** (workload submission). See the [add-benchmark skill](.claude/skills/add-benchmark/SKILL.md) for the full guide. 
+ +## Images + +```text +images/ +├── dataset/ +│ └── alpaca/ Alpaca dataset image +├── model/ +│ └── bloom560m/ BLOOM-560M model image +├── runtime/ +│ ├── training/ Runtime training images (~10 variants) +│ │ ├── py311-cuda121-torch241/ +│ │ ├── py311-cuda124-torch251/ +│ │ ├── ... +│ │ └── py312-rocm64-torch290/ +│ ├── ray/ Ray runtime images +│ │ ├── cuda/ CUDA variants +│ │ └── rocm/ ROCm variants +│ └── examples/ Example-specific runtime images +├── universal/ +│ └── training/ Universal training images (3 variants) +│ ├── th06-cpu-torch210-py312/ +│ ├── th06-cuda130-torch210-py312/ +│ └── th06-rocm64-torch291-py312/ +├── tests/ Test runner image +└── util/ + └── mc-cli/ MinIO client utility image +``` + +Key distinction for dependency management (matters for CVE fixes): + +- **Runtime training images** (`images/runtime/training/`) use `Pipfile`/`Pipfile.lock` (pipenv) and pull from public PyPI. Two openmpi41 variants are exceptions that use `pyproject.toml`/`requirements.txt` instead. See `images/runtime/training/README.md`. +- **Universal training images** (`images/universal/training/`) use `pyproject.toml`/`requirements.txt` (pip) and pull from a private AIPCC PyPI index. See `images/universal/training/README.md`. + +## Examples + +```text +examples/ +├── hpo-raytune/ Ray Tune HPO on OpenShift AI +├── kfto-dreambooth/ Stable Diffusion DreamBooth with KFTO +├── kfto-feast/ Fine-tuning with Feast feature store +├── kfto-sft-feast-rag/ SFT + Feast + RAG pipeline +├── kfto-sft-llm/ LLM SFT with KFTO +├── kfto_feast_rag/ End-to-end RAG with Feast + Milvus +├── rag-llm/ RAG with HuggingFace + sentence-transformers +├── ray-docling/ Batch document processing with Ray + Docling +├── ray-finetune-llm-deepspeed/ LLM fine-tuning with Ray + DeepSpeed +└── stable-diffusion-dreambooth/ Stable Diffusion DreamBooth (standalone) +``` + +Each example contains a README, one or more Jupyter notebooks, and supporting resources (datasets, configs, Kubernetes manifests). 
diff --git a/Makefile b/Makefile
index 9e2a97301..3f291ec96 100644
--- a/Makefile
+++ b/Makefile
@@ -85,3 +85,7 @@ golangci-lint: golangci-lint-install ## Run golangci-lint on the codebase.
 .PHONY: precommit
 precommit:
 	pre-commit run --all-files
+
+.PHONY: sync-agent-skills
+sync-agent-skills: ## Sync AI agent skills from .claude/skills/ to other tools (Cursor, etc.)
+	@./hack/sync-agent-skills.sh
diff --git a/hack/sync-agent-skills.sh b/hack/sync-agent-skills.sh
new file mode 100755
index 000000000..ff3b6fb6a
--- /dev/null
+++ b/hack/sync-agent-skills.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+#
+# Sync AI agent skills from .claude/skills/ (the canonical source) to other
+# tools. Currently writes one Cursor rule, .cursor/rules/<skill>.mdc, for
+# every skill directory that contains a SKILL.md.
+#
+# Usage: ./hack/sync-agent-skills.sh   (or: make sync-agent-skills)
+set -euo pipefail
+
+# Resolve the repo root from this script's location (hack/..) and work from
+# there, so the relative paths below do not depend on the caller's cwd.
+REPO_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
+readonly REPO_ROOT
+cd -- "$REPO_ROOT"
+
+readonly SKILLS_DIR=".claude/skills"
+readonly CURSOR_RULES_DIR=".cursor/rules"
+
+# Print the Cursor rule description for skill $1.
+# Skills without a dedicated entry fall back to the skill name.
+get_description() {
+  case "$1" in
+    add-e2e-test) echo "Guide for adding E2E tests to the distributed-workloads repo" ;;
+    add-benchmark) echo "Guide for adding benchmarks to benchmarks/" ;;
+    update-support-lib) echo "Guide for modifying the shared test support library" ;;
+    *) echo "$1" ;;
+  esac
+}
+
+# Print the file glob that scopes the Cursor rule for skill $1.
+# Skills without a dedicated entry print an empty string (no "globs" key).
+get_globs() {
+  case "$1" in
+    add-e2e-test) echo "tests/**/*.go" ;;
+    add-benchmark) echo "benchmarks/**/*" ;;
+    update-support-lib) echo "tests/common/support/**/*.go" ;;
+    *) echo "" ;;
+  esac
+}
+
+# Write $CURSOR_RULES_DIR/<skill>.mdc (YAML front matter followed by the
+# SKILL.md content) for each directory under $SKILLS_DIR that has a SKILL.md.
+sync_cursor() {
+  local skill_dir name skill_file desc glob out
+
+  mkdir -p -- "$CURSOR_RULES_DIR"
+
+  for skill_dir in "$SKILLS_DIR"/*/; do
+    skill_dir="${skill_dir%/}" # drop the glob's trailing slash
+    name="${skill_dir##*/}"
+    skill_file="$skill_dir/SKILL.md"
+    # Also skips the literal pattern left behind when the glob matches nothing.
+    [[ -f "$skill_file" ]] || continue
+
+    desc="$(get_description "$name")"
+    glob="$(get_globs "$name")"
+    out="$CURSOR_RULES_DIR/$name.mdc"
+
+    {
+      printf '%s\n' "---"
+      printf 'description: "%s"\n' "$desc"
+      if [[ -n "$glob" ]]; then
+        printf 'globs: "%s"\n' "$glob"
+      fi
+      printf '%s\n' "alwaysApply: false" "---" ""
+      cat -- "$skill_file"
+    } > "$out"
+
+    printf '  cursor: %s\n' "$out"
+  done
+}
+
+# Fail loudly instead of reporting "Done." after syncing nothing.
+if [[ ! -d "$SKILLS_DIR" ]]; then
+  printf 'error: skills directory not found: %s/%s\n' "$REPO_ROOT" "$SKILLS_DIR" >&2
+  exit 1
+fi
+
+echo "Syncing skills from $SKILLS_DIR ..."
+sync_cursor
+echo "Done."